xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 82a78a4ef1c3dde9953e002d5a85008393775538)
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
699acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
79ae82921SPaul Mullowney 
83d13b8fdSMatthew G. Knepley #include <petscconf.h>
93d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
10087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
113d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
12af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
139ae82921SPaul Mullowney #undef VecType
143d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
15a2cee5feSJed Brown #include <thrust/adjacent_difference.h>
16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h>
17a2cee5feSJed Brown #include <thrust/iterator/constant_iterator.h>
18a2cee5feSJed Brown #include <thrust/remove.h>
19a2cee5feSJed Brown #include <thrust/sort.h>
20a2cee5feSJed Brown #include <thrust/unique.h>
21e8d2b73aSMark Adams 
22e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
23afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
24afb2bd1cSJunchao Zhang   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
25afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
26afb2bd1cSJunchao Zhang 
27afb2bd1cSJunchao Zhang   typedef enum {
28afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
29afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
30afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
31afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
32afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
33afb2bd1cSJunchao Zhang 
34afb2bd1cSJunchao Zhang   typedef enum {
35afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
36afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
37afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
38afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
39afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
40afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
41afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
42afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
43afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
44afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
45afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
46afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
47afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
48afb2bd1cSJunchao Zhang 
49afb2bd1cSJunchao Zhang   typedef enum {
50afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
51afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
52afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
53afb2bd1cSJunchao Zhang   */
54afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
55afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
56afb2bd1cSJunchao Zhang   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
57afb2bd1cSJunchao Zhang #endif
589ae82921SPaul Mullowney 
59087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
60087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
61087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62087f3262SPaul Mullowney 
636fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
646fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
656fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
66087f3262SPaul Mullowney 
676fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
686fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
696fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
706fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
714416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
72a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
7333c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
746fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
756fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
766fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
776fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
78e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
79e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
80e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
819ae82921SPaul Mullowney 
827f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
83470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
84470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
85470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
86470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
877f756511SDominic Meiser 
88042217e8SBarry Smith PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
8957181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
90a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
9157181aedSStefano Zampini 
92c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
93c215019aSStefano Zampini 
94b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
95b06137fdSPaul Mullowney {
96b06137fdSPaul Mullowney   cusparseStatus_t   stat;
97b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
98b06137fdSPaul Mullowney 
99b06137fdSPaul Mullowney   PetscFunctionBegin;
100d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
101b06137fdSPaul Mullowney   cusparsestruct->stream = stream;
10257d48284SJunchao Zhang   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
103b06137fdSPaul Mullowney   PetscFunctionReturn(0);
104b06137fdSPaul Mullowney }
105b06137fdSPaul Mullowney 
106b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
107b06137fdSPaul Mullowney {
108b06137fdSPaul Mullowney   cusparseStatus_t   stat;
109b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
110b06137fdSPaul Mullowney 
111b06137fdSPaul Mullowney   PetscFunctionBegin;
112d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
1136b1cf21dSAlejandro Lamas Daviña   if (cusparsestruct->handle != handle) {
11416a2e217SAlejandro Lamas Daviña     if (cusparsestruct->handle) {
11557d48284SJunchao Zhang       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
11616a2e217SAlejandro Lamas Daviña     }
117b06137fdSPaul Mullowney     cusparsestruct->handle = handle;
1186b1cf21dSAlejandro Lamas Daviña   }
11957d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
120b06137fdSPaul Mullowney   PetscFunctionReturn(0);
121b06137fdSPaul Mullowney }
122b06137fdSPaul Mullowney 
123b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A)
124b06137fdSPaul Mullowney {
125b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1267e8381f9SStefano Zampini   PetscBool          flg;
1277e8381f9SStefano Zampini   PetscErrorCode     ierr;
128ccdfe979SStefano Zampini 
129b06137fdSPaul Mullowney   PetscFunctionBegin;
1307e8381f9SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1317e8381f9SStefano Zampini   if (!flg || !cusparsestruct) PetscFunctionReturn(0);
132ccdfe979SStefano Zampini   if (cusparsestruct->handle) cusparsestruct->handle = 0;
133b06137fdSPaul Mullowney   PetscFunctionReturn(0);
134b06137fdSPaul Mullowney }
135b06137fdSPaul Mullowney 
136ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
1379ae82921SPaul Mullowney {
1389ae82921SPaul Mullowney   PetscFunctionBegin;
1399ae82921SPaul Mullowney   *type = MATSOLVERCUSPARSE;
1409ae82921SPaul Mullowney   PetscFunctionReturn(0);
1419ae82921SPaul Mullowney }
1429ae82921SPaul Mullowney 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  on a single GPU, for matrices of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
1559ae82921SPaul Mullowney 
15642c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
1579ae82921SPaul Mullowney {
1589ae82921SPaul Mullowney   PetscErrorCode ierr;
159bc3f50f2SPaul Mullowney   PetscInt       n = A->rmap->n;
1609ae82921SPaul Mullowney 
1619ae82921SPaul Mullowney   PetscFunctionBegin;
162bc3f50f2SPaul Mullowney   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
163bc3f50f2SPaul Mullowney   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1642c7c0729SBarry Smith   (*B)->factortype = ftype;
1659ae82921SPaul Mullowney   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1662205254eSKarl Rupp 
1679c1083e7SRichard Tran Mills   if (A->boundtocpu && A->bindingpropagates) { ierr = MatBindToCPU(*B,PETSC_TRUE);CHKERRQ(ierr); }
168087f3262SPaul Mullowney   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
16933d57670SJed Brown     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1709c1083e7SRichard Tran Mills     if (!A->boundtocpu) {
1719ae82921SPaul Mullowney       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1729ae82921SPaul Mullowney       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
1739c1083e7SRichard Tran Mills     } else {
1749c1083e7SRichard Tran Mills       (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
1759c1083e7SRichard Tran Mills       (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
1769c1083e7SRichard Tran Mills     }
1774ac6704cSBarry Smith     ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
1784ac6704cSBarry Smith     ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
1794ac6704cSBarry Smith     ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
180087f3262SPaul Mullowney   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
1819c1083e7SRichard Tran Mills     if (!A->boundtocpu) {
182087f3262SPaul Mullowney       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
183087f3262SPaul Mullowney       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1849c1083e7SRichard Tran Mills     } else {
1859c1083e7SRichard Tran Mills       (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
1869c1083e7SRichard Tran Mills       (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
1879c1083e7SRichard Tran Mills     }
1884ac6704cSBarry Smith     ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
1894ac6704cSBarry Smith     ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
1909ae82921SPaul Mullowney   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
191bc3f50f2SPaul Mullowney 
192fa03d054SJed Brown   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1934ac6704cSBarry Smith   (*B)->canuseordering = PETSC_TRUE;
1943ca39a21SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
1959ae82921SPaul Mullowney   PetscFunctionReturn(0);
1969ae82921SPaul Mullowney }
1979ae82921SPaul Mullowney 
198bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
199ca45077fSPaul Mullowney {
200aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2016e111a19SKarl Rupp 
202ca45077fSPaul Mullowney   PetscFunctionBegin;
203ca45077fSPaul Mullowney   switch (op) {
204e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT:
205aa372e3fSPaul Mullowney     cusparsestruct->format = format;
206ca45077fSPaul Mullowney     break;
207e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
208aa372e3fSPaul Mullowney     cusparsestruct->format = format;
209ca45077fSPaul Mullowney     break;
210ca45077fSPaul Mullowney   default:
21136d62e41SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
212ca45077fSPaul Mullowney   }
213ca45077fSPaul Mullowney   PetscFunctionReturn(0);
214ca45077fSPaul Mullowney }
2159ae82921SPaul Mullowney 
216e057df02SPaul Mullowney /*@
217e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
218e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
219aa372e3fSPaul Mullowney    for MPIAIJCUSPARSE matrices.
220e057df02SPaul Mullowney    Not Collective
221e057df02SPaul Mullowney 
222e057df02SPaul Mullowney    Input Parameters:
2238468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
22436d62e41SPaul Mullowney .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
2252692e278SPaul Mullowney -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
226e057df02SPaul Mullowney 
227e057df02SPaul Mullowney    Output Parameter:
228e057df02SPaul Mullowney 
229e057df02SPaul Mullowney    Level: intermediate
230e057df02SPaul Mullowney 
2318468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
232e057df02SPaul Mullowney @*/
233e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
234e057df02SPaul Mullowney {
235e057df02SPaul Mullowney   PetscErrorCode ierr;
2366e111a19SKarl Rupp 
237e057df02SPaul Mullowney   PetscFunctionBegin;
238e057df02SPaul Mullowney   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
239e057df02SPaul Mullowney   ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
240e057df02SPaul Mullowney   PetscFunctionReturn(0);
241e057df02SPaul Mullowney }
242e057df02SPaul Mullowney 
243365b711fSMark Adams PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A,PetscBool use_cpu)
244365b711fSMark Adams {
245365b711fSMark Adams   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
246365b711fSMark Adams 
247365b711fSMark Adams   PetscFunctionBegin;
248365b711fSMark Adams   cusparsestruct->use_cpu_solve = use_cpu;
249365b711fSMark Adams   PetscFunctionReturn(0);
250365b711fSMark Adams }
251365b711fSMark Adams 
252365b711fSMark Adams /*@
253365b711fSMark Adams    MatCUSPARSESetUseCPUSolve - Sets use CPU MatSolve.
254365b711fSMark Adams 
255365b711fSMark Adams    Input Parameters:
256365b711fSMark Adams +  A - Matrix of type SEQAIJCUSPARSE
257365b711fSMark Adams -  use_cpu - set flag for using the built-in CPU MatSolve
258365b711fSMark Adams 
259365b711fSMark Adams    Output Parameter:
260365b711fSMark Adams 
261365b711fSMark Adams    Notes:
262365b711fSMark Adams    The cuSparse LU solver currently computes the factors with the built-in CPU method
263365b711fSMark Adams    and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and computing the solve there.
264365b711fSMark Adams    This method to specify if the solve is done on the CPU or GPU (GPU is the default).
265365b711fSMark Adams 
266365b711fSMark Adams    Level: intermediate
267365b711fSMark Adams 
268365b711fSMark Adams .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
269365b711fSMark Adams @*/
270365b711fSMark Adams PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A,PetscBool use_cpu)
271365b711fSMark Adams {
272365b711fSMark Adams   PetscErrorCode ierr;
273365b711fSMark Adams 
274365b711fSMark Adams   PetscFunctionBegin;
275365b711fSMark Adams   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
276365b711fSMark Adams   ierr = PetscTryMethod(A,"MatCUSPARSESetUseCPUSolve_C",(Mat,PetscBool),(A,use_cpu));CHKERRQ(ierr);
277365b711fSMark Adams   PetscFunctionReturn(0);
278365b711fSMark Adams }
279365b711fSMark Adams 
2801a2c6b5cSJunchao Zhang PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
281e6e9a74fSStefano Zampini {
282e6e9a74fSStefano Zampini   PetscErrorCode ierr;
283e6e9a74fSStefano Zampini 
284e6e9a74fSStefano Zampini   PetscFunctionBegin;
2851a2c6b5cSJunchao Zhang   switch (op) {
2861a2c6b5cSJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
2871a2c6b5cSJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
2881a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
2891a2c6b5cSJunchao Zhang       A->form_explicit_transpose = flg;
2901a2c6b5cSJunchao Zhang       break;
2911a2c6b5cSJunchao Zhang     default:
2921a2c6b5cSJunchao Zhang       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
2931a2c6b5cSJunchao Zhang       break;
294e6e9a74fSStefano Zampini   }
295e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
296e6e9a74fSStefano Zampini }
297e6e9a74fSStefano Zampini 
298bddcd29dSMark Adams static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
299bddcd29dSMark Adams 
300bddcd29dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
301bddcd29dSMark Adams {
302bddcd29dSMark Adams   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
303bddcd29dSMark Adams   IS             isrow = b->row,iscol = b->col;
304bddcd29dSMark Adams   PetscBool      row_identity,col_identity;
305365b711fSMark Adams   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)B->spptr;
306bddcd29dSMark Adams   PetscErrorCode ierr;
307bddcd29dSMark Adams 
308bddcd29dSMark Adams   PetscFunctionBegin;
309bddcd29dSMark Adams   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
310bddcd29dSMark Adams   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
311bddcd29dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
312bddcd29dSMark Adams   /* determine which version of MatSolve needs to be used. */
313bddcd29dSMark Adams   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
314bddcd29dSMark Adams   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
315bddcd29dSMark Adams   if (row_identity && col_identity) {
316365b711fSMark Adams     if (!cusparsestruct->use_cpu_solve) {
317bddcd29dSMark Adams       B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
318bddcd29dSMark Adams       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
319365b711fSMark Adams     }
320bddcd29dSMark Adams     B->ops->matsolve = NULL;
321bddcd29dSMark Adams     B->ops->matsolvetranspose = NULL;
322bddcd29dSMark Adams   } else {
323365b711fSMark Adams     if (!cusparsestruct->use_cpu_solve) {
324bddcd29dSMark Adams       B->ops->solve = MatSolve_SeqAIJCUSPARSE;
325bddcd29dSMark Adams       B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
326365b711fSMark Adams     }
327bddcd29dSMark Adams     B->ops->matsolve = NULL;
328bddcd29dSMark Adams     B->ops->matsolvetranspose = NULL;
329bddcd29dSMark Adams   }
330bddcd29dSMark Adams 
331bddcd29dSMark Adams   /* get the triangular factors */
332365b711fSMark Adams   if (!cusparsestruct->use_cpu_solve) {
333bddcd29dSMark Adams     ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
334365b711fSMark Adams   }
335bddcd29dSMark Adams   PetscFunctionReturn(0);
336bddcd29dSMark Adams }
337bddcd29dSMark Adams 
3384416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
3399ae82921SPaul Mullowney {
3409ae82921SPaul Mullowney   PetscErrorCode           ierr;
341e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
3429ae82921SPaul Mullowney   PetscBool                flg;
343a183c035SDominic Meiser   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
3446e111a19SKarl Rupp 
3459ae82921SPaul Mullowney   PetscFunctionBegin;
346e55864a3SBarry Smith   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
3479ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
348e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
349a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
350afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
351afb2bd1cSJunchao Zhang 
3524c87dfd4SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
353a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
354afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
355365b711fSMark Adams     ierr = PetscOptionsBool("-mat_cusparse_use_cpu_solve","Use CPU (I)LU solve","MatCUSPARSESetUseCPUSolve",cusparsestruct->use_cpu_solve,&cusparsestruct->use_cpu_solve,&flg);CHKERRQ(ierr);
356365b711fSMark Adams     if (flg) {ierr = MatCUSPARSESetUseCPUSolve(A,cusparsestruct->use_cpu_solve);CHKERRQ(ierr);}
357afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
358afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
359afb2bd1cSJunchao Zhang                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
360afb2bd1cSJunchao Zhang     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
361a435da06SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
362a435da06SStefano Zampini     if (flg && CUSPARSE_SPMV_CSR_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
363a435da06SStefano Zampini #else
364afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
365a435da06SStefano Zampini #endif
366afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
367afb2bd1cSJunchao Zhang                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
368afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
369afb2bd1cSJunchao Zhang 
370afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
371afb2bd1cSJunchao Zhang                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
372afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
373afb2bd1cSJunchao Zhang    #endif
3744c87dfd4SPaul Mullowney   }
3750af67c1bSStefano Zampini   ierr = PetscOptionsTail();CHKERRQ(ierr);
3769ae82921SPaul Mullowney   PetscFunctionReturn(0);
3779ae82921SPaul Mullowney }
3789ae82921SPaul Mullowney 
3796fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3809ae82921SPaul Mullowney {
381da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3829ae82921SPaul Mullowney   PetscErrorCode               ierr;
3839ae82921SPaul Mullowney 
3849ae82921SPaul Mullowney   PetscFunctionBegin;
385da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
3869ae82921SPaul Mullowney   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3879ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3889ae82921SPaul Mullowney   PetscFunctionReturn(0);
3899ae82921SPaul Mullowney }
3909ae82921SPaul Mullowney 
3916fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3929ae82921SPaul Mullowney {
393da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3949ae82921SPaul Mullowney   PetscErrorCode               ierr;
3959ae82921SPaul Mullowney 
3969ae82921SPaul Mullowney   PetscFunctionBegin;
397da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
3989ae82921SPaul Mullowney   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3999ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
4009ae82921SPaul Mullowney   PetscFunctionReturn(0);
4019ae82921SPaul Mullowney }
4029ae82921SPaul Mullowney 
403087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
404087f3262SPaul Mullowney {
405da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
406087f3262SPaul Mullowney   PetscErrorCode               ierr;
407087f3262SPaul Mullowney 
408087f3262SPaul Mullowney   PetscFunctionBegin;
409da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
410087f3262SPaul Mullowney   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
411087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
412087f3262SPaul Mullowney   PetscFunctionReturn(0);
413087f3262SPaul Mullowney }
414087f3262SPaul Mullowney 
415087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
416087f3262SPaul Mullowney {
417da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
418087f3262SPaul Mullowney   PetscErrorCode               ierr;
419087f3262SPaul Mullowney 
420087f3262SPaul Mullowney   PetscFunctionBegin;
421da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
422087f3262SPaul Mullowney   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
423087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
424087f3262SPaul Mullowney   PetscFunctionReturn(0);
425087f3262SPaul Mullowney }
426087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILULowerTriMatrix - Creates, or refreshes the values of, the CUSPARSE
  copy of the lower triangular ILU factor held in A.

  Nothing is done when A has no local rows, or when A->offloadmask is neither
  PETSC_OFFLOAD_UNALLOCATED nor PETSC_OFFLOAD_CPU.

  First call (no loTriFactorPtr yet): CSR arrays for the lower factor are assembled in
  page-locked host buffers, with an explicit 1.0 appended as the diagonal entry of every
  row. They are copied into thrust device arrays, a matrix descriptor (lower fill, unit
  diagonal) is created and the CUSPARSE triangular-solve analysis is run. The host value
  buffer is kept in AA_h; the host index buffers are released.

  Later calls: only the values are regenerated in AA_h and re-uploaded; the sparsity
  pattern and the analysis data are reused.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lo = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  const PetscInt                    *ai = a->i,*aj = a->j;
  const MatScalar                   *aa = a->a;
  PetscInt                          *rowsHost,*colsHost;
  PetscInt                          row,len,nzLower,src,dst;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only act when the offload mask is UNALLOCATED or CPU */
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(0);

  try {
    /* number of nonzeros in the lower triangular matrix, including the 1's on the diagonal */
    nzLower = n + ai[n] - ai[1];

    if (lo) {
      /* ---- factor already exists on the device: update values only ---- */
      PetscScalar *vals;

      if (!lo->AA_h) {
        cerr = cudaMallocHost((void**) &lo->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
      }
      vals = lo->AA_h;

      /* row 0 holds only its unit diagonal */
      vals[0] = 1.0;
      src     = 0;
      dst     = 1;
      for (row=1; row<n; row++) {
        len  = ai[row+1] - ai[row];
        ierr = PetscArraycpy(vals+dst, aa+src, len);CHKERRQ(ierr);
        src += len;
        dst += len;
        /* unit diagonal closes the row */
        vals[dst] = 1.0;
        dst      += 1;
      }
      lo->csrMat->values->assign(vals, vals+nzLower);
      ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* ---- first time through: build pattern, values and solve analysis ---- */
      PetscScalar *valsHost;
      CsrMatrix   *mat;

      cerr = cudaMallocHost((void**) &valsHost, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* host staging buffers for the row offsets and column indices */
      cerr = cudaMallocHost((void**) &rowsHost, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &colsHost, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);

      /* row 0 holds only its unit diagonal */
      rowsHost[0] = (PetscInt) 0;
      rowsHost[n] = nzLower;
      colsHost[0] = (PetscInt) 0;
      valsHost[0] = (MatScalar) 1.0;
      src         = 0;
      dst         = 1;
      for (row=1; row<n; row++) {
        len = ai[row+1] - ai[row];
        /* each row occupies len off-diagonal entries plus one diagonal entry,
           so the write cursor is also the row's starting offset */
        rowsHost[row] = dst;

        ierr = PetscArraycpy(colsHost+dst, aj+src, len);CHKERRQ(ierr);
        ierr = PetscArraycpy(valsHost+dst, aa+src, len);CHKERRQ(ierr);
        src += len;
        dst += len;

        /* unit diagonal closes the row */
        colsHost[dst] = (PetscInt) row;
        valsHost[dst] = (MatScalar) 1.0;
        dst          += 1;
      }

      /* allocate space for the triangular factor information */
      ierr = PetscNew(&lo);CHKERRQ(ierr);
      lo->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

      /* matrix description: zero-based, lower fill, unit diagonal */
      stat = cusparseCreateMatDescr(&lo->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(lo->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparseSetMatType(lo->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
      stat = cusparseSetMatType(lo->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
      stat = cusparseSetMatFillMode(lo->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(lo->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);

      /* the solve uses the factor as stored (no transpose) */
      lo->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* device CSR matrix, filled from the host staging buffers */
      lo->csrMat       = new CsrMatrix;
      mat              = lo->csrMat;
      mat->num_rows    = n;
      mat->num_cols    = n;
      mat->num_entries = nzLower;

      mat->row_offsets = new THRUSTINTARRAY32(n+1);
      mat->row_offsets->assign(rowsHost, rowsHost+n+1);

      mat->column_indices = new THRUSTINTARRAY32(nzLower);
      mat->column_indices->assign(colsHost, colsHost+nzLower);

      mat->values = new THRUSTARRAY(nzLower);
      mat->values->assign(valsHost, valsHost+nzLower);

      /* create the solve analysis information */
      ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
      stat = cusparse_create_analysis_info(&lo->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparse_get_svbuffsize(factors->handle, lo->solveOp,
                                     mat->num_rows, mat->num_entries, lo->descr,
                                     mat->values->data().get(), mat->row_offsets->data().get(),
                                     mat->column_indices->data().get(), lo->solveInfo,
                                     &lo->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&lo->solveBuffer,lo->solveBufferSize);CHKERRCUDA(cerr);
#endif

      /* perform the solve analysis */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparse_analysis(factors->handle, lo->solveOp,
                               mat->num_rows, mat->num_entries, lo->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(),
                               lo->solveInfo,
                               lo->solvePolicy, lo->solveBuffer);CHKERRCUSPARSE(stat);
#else
      stat = cusparse_analysis(factors->handle, lo->solveOp,
                               mat->num_rows, mat->num_entries, lo->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(),
                               lo->solveInfo);CHKERRCUSPARSE(stat);
#endif
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

      /* publish the factor; the value buffer is kept for later value-only updates */
      factors->loTriFactorPtr = lo;
      lo->AA_h                = valsHost;
      cerr = cudaFreeHost(rowsHost);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(colsHost);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  PetscFunctionReturn(0);
}
5739ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - Creates, or refreshes the values of, the CUSPARSE
  copy of the upper triangular ILU factor held in A.

  Nothing is done when A has no local rows, or when A->offloadmask is neither
  PETSC_OFFLOAD_UNALLOCATED nor PETSC_OFFLOAD_CPU.

  The host factor is walked from the last row to the first using a->diag. For each row the
  diagonal entry is written first, as the reciprocal of the value found right after the
  row's off-diagonal entries, followed by the off-diagonal entries themselves.

  First call (no upTriFactorPtr yet): the CSR arrays are assembled in page-locked host
  buffers and copied into thrust device arrays. A matrix descriptor (upper fill, non-unit
  diagonal) is created and the CUSPARSE triangular-solve analysis is run. The host value
  buffer is kept in AA_h; the host index buffers are released.

  Later calls: only the values are regenerated in AA_h and re-uploaded.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *up = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*rowCols;
  const MatScalar                   *aa = a->a,*rowVals;
  PetscInt                          *rowsHost,*colsHost;
  PetscInt                          row,len,nzUpper,dst;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* only act when the offload mask is UNALLOCATED or CPU */
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(0);

  try {
    /* number of nonzeros in the upper triangular matrix */
    nzUpper = adiag[0] - adiag[n];

    if (up) {
      /* ---- factor already exists on the device: update values only ---- */
      PetscScalar *vals;

      if (!up->AA_h) {
        cerr = cudaMallocHost((void**) &up->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
      }
      vals = up->AA_h;

      /* fill from the back: the write cursor moves towards the start of the buffer */
      dst = nzUpper;
      for (row=n-1; row>=0; row--) {
        rowVals = aa + adiag[row+1] + 1;
        /* number of elements NOT on the diagonal */
        len     = adiag[row] - adiag[row+1] - 1;
        dst    -= (len+1);

        /* diagonal first, then the off-diagonal entries */
        vals[dst] = 1./rowVals[len];
        ierr = PetscArraycpy(vals+dst+1, rowVals, len);CHKERRQ(ierr);
      }
      up->csrMat->values->assign(vals, vals+nzUpper);
      ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* ---- first time through: build pattern, values and solve analysis ---- */
      PetscScalar *valsHost;
      CsrMatrix   *mat;

      cerr = cudaMallocHost((void**) &valsHost, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

      /* host staging buffers for the row offsets and column indices */
      cerr = cudaMallocHost((void**) &rowsHost, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &colsHost, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

      rowsHost[0] = (PetscInt) 0;
      rowsHost[n] = nzUpper;
      /* fill from the back: the write cursor moves towards the start of the buffers */
      dst = nzUpper;
      for (row=n-1; row>=0; row--) {
        rowVals = aa + adiag[row+1] + 1;
        rowCols = aj + adiag[row+1] + 1;
        /* number of elements NOT on the diagonal */
        len     = adiag[row] - adiag[row+1] - 1;
        dst    -= (len+1);

        /* diagonal first; the cursor is also the row's starting offset, because it
           started at rowsHost[n] and has been lowered by every later row's length */
        colsHost[dst] = (PetscInt) row;
        valsHost[dst] = (MatScalar)1./rowVals[len];
        rowsHost[row] = dst;

        ierr = PetscArraycpy(colsHost+dst+1, rowCols, len);CHKERRQ(ierr);
        ierr = PetscArraycpy(valsHost+dst+1, rowVals, len);CHKERRQ(ierr);
      }

      /* allocate space for the triangular factor information */
      ierr = PetscNew(&up);CHKERRQ(ierr);
      up->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

      /* matrix description: zero-based, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&up->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(up->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparseSetMatType(up->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
#else
      stat = cusparseSetMatType(up->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
#endif
      stat = cusparseSetMatFillMode(up->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(up->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* the solve uses the factor as stored (no transpose) */
      up->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* device CSR matrix, filled from the host staging buffers */
      up->csrMat       = new CsrMatrix;
      mat              = up->csrMat;
      mat->num_rows    = n;
      mat->num_cols    = n;
      mat->num_entries = nzUpper;

      mat->row_offsets = new THRUSTINTARRAY32(n+1);
      mat->row_offsets->assign(rowsHost, rowsHost+n+1);

      mat->column_indices = new THRUSTINTARRAY32(nzUpper);
      mat->column_indices->assign(colsHost, colsHost+nzUpper);

      mat->values = new THRUSTARRAY(nzUpper);
      mat->values->assign(valsHost, valsHost+nzUpper);

      /* create the solve analysis information */
      ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
      stat = cusparse_create_analysis_info(&up->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparse_get_svbuffsize(factors->handle, up->solveOp,
                                     mat->num_rows, mat->num_entries, up->descr,
                                     mat->values->data().get(), mat->row_offsets->data().get(),
                                     mat->column_indices->data().get(), up->solveInfo,
                                     &up->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&up->solveBuffer,up->solveBufferSize);CHKERRCUDA(cerr);
#endif

      /* perform the solve analysis */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparse_analysis(factors->handle, up->solveOp,
                               mat->num_rows, mat->num_entries, up->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(),
                               up->solveInfo,
                               up->solvePolicy, up->solveBuffer);CHKERRCUSPARSE(stat);
#else
      stat = cusparse_analysis(factors->handle, up->solveOp,
                               mat->num_rows, mat->num_entries, up->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(),
                               up->solveInfo);CHKERRCUSPARSE(stat);
#endif
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

      /* publish the factor; the value buffer is kept for later value-only updates */
      factors->upTriFactorPtr = up;
      up->AA_h                = valsHost;
      cerr = cudaFreeHost(rowsHost);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(colsHost);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  PetscFunctionReturn(0);
}
7199ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - Puts both ILU triangular factors of A on the GPU
  and prepares the auxiliary data stored alongside them.

  Steps, in order:
    1. fail with PETSC_ERR_COR if A->spptr carries no triangular-factor container;
    2. build/refresh the lower and the upper factor;
    3. allocate the work vector of length n on first use and record a->nz as nnz;
    4. mark A as PETSC_OFFLOAD_BOTH;
    5. for each of the row (a->row) and column (a->icol) index sets that is not the
       identity, copy its indices into a device array, unless that array already exists.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *a       = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowPerm = a->row,colPerm = a->icol;
  PetscInt                     n = A->rmap->n;
  PetscBool                    rowIsIdentity,colIsIdentity;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  if (!factors->workVector) {
    factors->workVector = new THRUSTARRAY(n);
  }
  factors->nnz = a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation: uploaded once, and only when it is not the identity */
  ierr = ISIdentity(rowPerm,&rowIsIdentity);CHKERRQ(ierr);
  if (!rowIsIdentity) {
    if (!factors->rpermIndices) {
      const PetscInt *idx;

      ierr = ISGetIndices(rowPerm,&idx);CHKERRQ(ierr);
      factors->rpermIndices = new THRUSTINTARRAY(n);
      factors->rpermIndices->assign(idx, idx+n);
      ierr = ISRestoreIndices(rowPerm,&idx);CHKERRQ(ierr);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
    }
  }

  /* column permutation: uploaded once, and only when it is not the identity */
  ierr = ISIdentity(colPerm,&colIsIdentity);CHKERRQ(ierr);
  if (!colIsIdentity) {
    if (!factors->cpermIndices) {
      const PetscInt *idx;

      ierr = ISGetIndices(colPerm,&idx);CHKERRQ(ierr);
      factors->cpermIndices = new THRUSTINTARRAY(n);
      factors->cpermIndices->assign(idx, idx+n);
      ierr = ISRestoreIndices(colPerm,&idx);CHKERRQ(ierr);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
7639ae82921SPaul Mullowney 
764087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
765087f3262SPaul Mullowney {
766087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
767087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
768aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
769aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
770087f3262SPaul Mullowney   cusparseStatus_t                  stat;
771087f3262SPaul Mullowney   PetscErrorCode                    ierr;
77257d48284SJunchao Zhang   cudaError_t                       cerr;
773087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
774087f3262SPaul Mullowney   PetscScalar                       *AAUp;
775087f3262SPaul Mullowney   PetscScalar                       *AALo;
776087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
777087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
778087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
779087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
780087f3262SPaul Mullowney 
781087f3262SPaul Mullowney   PetscFunctionBegin;
782cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
783c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
784087f3262SPaul Mullowney     try {
785da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
786da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
787da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
788087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
78957d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
79057d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
791087f3262SPaul Mullowney 
792087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
793087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
794087f3262SPaul Mullowney         AiUp[n]=nzUpper;
795087f3262SPaul Mullowney         offset = 0;
796087f3262SPaul Mullowney         for (i=0; i<n; i++) {
797087f3262SPaul Mullowney           /* set the pointers */
798087f3262SPaul Mullowney           v  = aa + ai[i];
799087f3262SPaul Mullowney           vj = aj + ai[i];
800087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
801087f3262SPaul Mullowney 
802087f3262SPaul Mullowney           /* first, set the diagonal elements */
803087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
80409f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
805087f3262SPaul Mullowney           AiUp[i]      = offset;
80609f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
807087f3262SPaul Mullowney 
808087f3262SPaul Mullowney           offset+=1;
809087f3262SPaul Mullowney           if (nz>0) {
810f22e0265SBarry Smith             ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
811580bdb30SBarry Smith             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
812087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
813087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
814087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
815087f3262SPaul Mullowney             }
816087f3262SPaul Mullowney             offset+=nz;
817087f3262SPaul Mullowney           }
818087f3262SPaul Mullowney         }
819087f3262SPaul Mullowney 
820aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
821da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
822da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
823087f3262SPaul Mullowney 
824aa372e3fSPaul Mullowney         /* Create the matrix description */
82557d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
82657d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
8271b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
828afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
829afb2bd1cSJunchao Zhang        #else
83057d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
831afb2bd1cSJunchao Zhang        #endif
83257d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
83357d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
834087f3262SPaul Mullowney 
835aa372e3fSPaul Mullowney         /* set the matrix */
836aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
837aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
838aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
839aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
840aa372e3fSPaul Mullowney 
841aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
842aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
843aa372e3fSPaul Mullowney 
844aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
845aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
846aa372e3fSPaul Mullowney 
847aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
848aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
849aa372e3fSPaul Mullowney 
850afb2bd1cSJunchao Zhang         /* set the operation */
851afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
852afb2bd1cSJunchao Zhang 
853afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
854da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
855afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
8561b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
857afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
858afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
859afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
860afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
861afb2bd1cSJunchao Zhang                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
862afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
863afb2bd1cSJunchao Zhang       #endif
864afb2bd1cSJunchao Zhang 
865aa372e3fSPaul Mullowney         /* perform the solve analysis */
866aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
867aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
868aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
869d49cd2b7SBarry Smith                                  upTriFactor->csrMat->column_indices->data().get(),
8701b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
871d49cd2b7SBarry Smith                                  upTriFactor->solveInfo,
872d49cd2b7SBarry Smith                                  upTriFactor->solvePolicy, upTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
873d49cd2b7SBarry Smith                                 #else
874d49cd2b7SBarry Smith                                   upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
875afb2bd1cSJunchao Zhang                                 #endif
876da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
877da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
878aa372e3fSPaul Mullowney 
879da79fbbcSStefano Zampini         /* assign the pointer */
880aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
881aa372e3fSPaul Mullowney 
882aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
883da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
884da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
885aa372e3fSPaul Mullowney 
886aa372e3fSPaul Mullowney         /* Create the matrix description */
88757d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
88857d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
8891b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
890afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
891afb2bd1cSJunchao Zhang        #else
89257d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
893afb2bd1cSJunchao Zhang        #endif
89457d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
89557d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
896aa372e3fSPaul Mullowney 
897aa372e3fSPaul Mullowney         /* set the operation */
898aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
899aa372e3fSPaul Mullowney 
900aa372e3fSPaul Mullowney         /* set the matrix */
901aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
902aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
903aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
904aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
905aa372e3fSPaul Mullowney 
906aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
907aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
908aa372e3fSPaul Mullowney 
909aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
910aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
911aa372e3fSPaul Mullowney 
912aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
913aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
914aa372e3fSPaul Mullowney 
915afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
916da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
917afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
9181b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
919afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
920afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
921afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
922afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
923afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
924afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
925afb2bd1cSJunchao Zhang       #endif
926afb2bd1cSJunchao Zhang 
927aa372e3fSPaul Mullowney         /* perform the solve analysis */
928aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
929aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
930aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
931d49cd2b7SBarry Smith                                  loTriFactor->csrMat->column_indices->data().get(),
9321b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
933d49cd2b7SBarry Smith                                  loTriFactor->solveInfo,
934d49cd2b7SBarry Smith                                  loTriFactor->solvePolicy, loTriFactor->solveBuffer);CHKERRCUSPARSE(stat);
935d49cd2b7SBarry Smith                                 #else
936d49cd2b7SBarry Smith                                  loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
937afb2bd1cSJunchao Zhang                                 #endif
938da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
939da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
940aa372e3fSPaul Mullowney 
941da79fbbcSStefano Zampini         /* assign the pointer */
942aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
943087f3262SPaul Mullowney 
944da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
94557d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
94657d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
947da79fbbcSStefano Zampini       } else {
948da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
949da79fbbcSStefano Zampini         offset = 0;
950da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
951da79fbbcSStefano Zampini           /* set the pointers */
952da79fbbcSStefano Zampini           v  = aa + ai[i];
953da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
954da79fbbcSStefano Zampini 
955da79fbbcSStefano Zampini           /* first, set the diagonal elements */
956da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
957da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
958da79fbbcSStefano Zampini 
959da79fbbcSStefano Zampini           offset+=1;
960da79fbbcSStefano Zampini           if (nz>0) {
961da79fbbcSStefano Zampini             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
962da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
963da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
964da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
965da79fbbcSStefano Zampini             }
966da79fbbcSStefano Zampini             offset+=nz;
967da79fbbcSStefano Zampini           }
968da79fbbcSStefano Zampini         }
969da79fbbcSStefano Zampini         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
970da79fbbcSStefano Zampini         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
971da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
972da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
973da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
974da79fbbcSStefano Zampini       }
97557d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
97657d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
977087f3262SPaul Mullowney     } catch(char *ex) {
978087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
979087f3262SPaul Mullowney     }
980087f3262SPaul Mullowney   }
981087f3262SPaul Mullowney   PetscFunctionReturn(0);
982087f3262SPaul Mullowney }
983087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - Calls MatSeqAIJCUSPARSEBuildICCTriMatrices() on A,
  makes sure the work vector exists, records the nonzero count of the two factors and, when the
  ordering stored in the matrix is not the identity, copies the permutation and its inverse into
  the rpermIndices/cpermIndices arrays.

  Input Parameter:
. A - the factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors and A->data as a Mat_SeqAIJ

  Notes:
  Raises PETSC_ERR_COR if A->spptr is NULL.

  The permutation arrays are allocated only on the first call and refreshed with assign() on later
  calls, so repeated numeric factorizations do not leak the previously allocated arrays.
  NOTE(review): this relies on rpermIndices/cpermIndices starting out NULL, as workVector does - confirm
  against the code that creates/resets Mat_SeqAIJCUSPARSETriFactors.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* a->nz entries are stored once; every entry except the n diagonal ones is counted for both factors */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* permutation indices are only needed for a non-identity ordering */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    /* allocate once, then overwrite: a plain "new" here would drop the array of a previous call */
    if (!cusparseTriFactors->rpermIndices) { cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    if (!cusparseTriFactors->cpermIndices) { cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    /* two index arrays of length n were transferred */
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
1021087f3262SPaul Mullowney 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - Numeric Cholesky factorization entry point.

  Calls MatSeqAIJCUSPARSECopyFromGPU() on A, runs MatCholeskyFactorNumeric_SeqAIJ(), marks B as
  PETSC_OFFLOAD_CPU, selects the solve callbacks according to whether the ordering stored in B is
  the identity, and finally calls MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU() on B.

  Input Parameters:
+ B    - the factor matrix
. A    - the matrix being factored
- info - factorization options, forwarded unchanged to MatCholeskyFactorNumeric_SeqAIJ()
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact   = (Mat_SeqAIJ*)B->data;
  IS             ordering = fact->row;
  PetscBool      natural;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* no MatMatSolve variants are installed, whatever the ordering */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* choose the MatSolve flavour from the ordering */
  ierr = ISIdentity(ordering,&natural);CHKERRQ(ierr);
  if (!natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }

  /* build the triangular factors for B */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
10519ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildTransposeTriFactor_Private - Creates the explicit transpose (CSC form) of one
  triangular factor and runs the cuSPARSE solve analysis on it.

  Input Parameters:
+ A                  - the matrix, used only as the object attached to the log events
. cusparseTriFactors - container whose cuSPARSE handle is used for every library call
- triFactor          - the triangular factor to transpose

  Output Parameter:
. triFactorT - the newly allocated transposed factor; the caller stores it

  Notes:
  The new descriptor copies index base, matrix type and diagonal type from triFactor and flips the
  fill mode. The transposed factor is always solved with CUSPARSE_OPERATION_NON_TRANSPOSE.

  With CUDA >= 11 the csr2csc work buffer is allocated here and stored on triFactor (the source
  factor), not on the transposed one.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildTransposeTriFactor_Private(Mat A,Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors,Mat_SeqAIJCUSPARSETriFactorStruct *triFactor,Mat_SeqAIJCUSPARSETriFactorStruct **triFactorT)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *factorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!triFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* allocate space for the transpose of the triangular factor */
  ierr = PetscNew(&factorT);CHKERRQ(ierr);
  factorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* read the descriptor of the source factor; transposing swaps upper and lower fill */
  matrixType = cusparseGetMatType(triFactor->descr);
  indexBase  = cusparseGetMatIndexBase(triFactor->descr);
  fillMode   = cusparseGetMatFillMode(triFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(triFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&factorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(factorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(factorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(factorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(factorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation */
  factorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the triangular factor (rows and columns trade places) */
  factorT->csrMat = new CsrMatrix;
  factorT->csrMat->num_rows       = triFactor->csrMat->num_cols;
  factorT->csrMat->num_cols       = triFactor->csrMat->num_rows;
  factorT->csrMat->num_entries    = triFactor->csrMat->num_entries;
  factorT->csrMat->row_offsets    = new THRUSTINTARRAY32(factorT->csrMat->num_rows+1);
  factorT->csrMat->column_indices = new THRUSTINTARRAY32(factorT->csrMat->num_entries);
  factorT->csrMat->values         = new THRUSTARRAY(factorT->csrMat->num_entries);

  /* compute the transpose of the triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                                       triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                                       triFactor->csrMat->values->data().get(),
                                       triFactor->csrMat->row_offsets->data().get(),
                                       triFactor->csrMat->column_indices->data().get(),
                                       factorT->csrMat->values->data().get(),
                                       factorT->csrMat->row_offsets->data().get(), factorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &triFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&triFactor->csr2cscBuffer,triFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                          triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                          triFactor->csrMat->values->data().get(),
                          triFactor->csrMat->row_offsets->data().get(),
                          triFactor->csrMat->column_indices->data().get(),
                          factorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          factorT->csrMat->row_offsets->data().get(), factorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, triFactor->csr2cscBuffer);CHKERRCUSPARSE(stat);
                        #else
                          factorT->csrMat->column_indices->data().get(), factorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                        #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* close the event opened above; this used to be a second PetscLogEventBegin(), which left the event unbalanced */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&factorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, factorT->solveOp,
                                 factorT->csrMat->num_rows, factorT->csrMat->num_entries, factorT->descr,
                                 factorT->csrMat->values->data().get(), factorT->csrMat->row_offsets->data().get(),
                                 factorT->csrMat->column_indices->data().get(), factorT->solveInfo,
                                 &factorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&factorT->solveBuffer,factorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, factorT->solveOp,
                           factorT->csrMat->num_rows, factorT->csrMat->num_entries, factorT->descr,
                           factorT->csrMat->values->data().get(), factorT->csrMat->row_offsets->data().get(),
                           factorT->csrMat->column_indices->data().get(),
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           factorT->solveInfo,
                           factorT->solvePolicy, factorT->solveBuffer);CHKERRCUSPARSE(stat);
                          #else
                           factorT->solveInfo);CHKERRCUSPARSE(stat);
                          #endif
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  *triFactorT = factorT;
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - Builds the explicit transposes of the lower and upper
  triangular factors stored in A->spptr, analyzes them for triangular solves, and stores them in
  loTriFactorPtrTranspose and upTriFactorPtrTranspose.

  Input Parameter:
. A - the factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Raises PETSC_ERR_COR if the factor container or either triangular factor is missing.
  The lower factor is processed (and its transpose stored) before the upper one.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = NULL;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = NULL;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* transpose of the lower triangular factor */
  ierr = MatSeqAIJCUSPARSEBuildTransposeTriFactor_Private(A,cusparseTriFactors,(Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr,&loTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /* transpose of the upper triangular factor */
  ierr = MatSeqAIJCUSPARSEBuildTransposeTriFactor_Private(A,cusparseTriFactors,(Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr,&upTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1257bda325fcSPaul Mullowney 
/*
  Unary functor usable from host and device code: maps a PetscScalar to a PetscInt by
  taking its real part and casting it to an integer. Suitable as the operation of a
  thrust::transform over an array of scalars that actually hold integral values.
*/
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar value)
  {
    const PetscReal re = PetscRealPart(value); /* drop the imaginary part, if any */
    return (PetscInt)re;
  }
};
1266a49f1ed0SStefano Zampini 
/*
  MatSeqAIJCUSPARSEFormExplicitTranspose - Creates, or refreshes the values of, the explicit
  transpose of A stored on the GPU in ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose.

  Input Parameter:
. A - the matrix; it is first copied to the GPU with MatSeqAIJCUSPARSECopyToGPU()

  Notes:
  Nothing is done when A->transupdated is already set (an error is raised if the flag is set
  but no transpose struct exists). On success A->transupdated is set to PETSC_TRUE.

  For MAT_CUSPARSE_CSR the transposed CsrMatrix is allocated once. The first time through, a
  csr2csc conversion is run on the sequence 0,1,...,nnz-1 used as "values"; the transposed
  "values" are therefore the permutation from CSR to CSC ordering, which is cached in
  cusparsestruct->csr2csc_i. Every call then fills the transposed values by gathering the
  values of A through that permutation.

  For MAT_CUSPARSE_ELL/MAT_CUSPARSE_HYB (only available before CUDA-11.0) any existing
  transpose is invalidated and rebuilt through HYB -> CSR -> CSC -> HYB conversions.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* non-CSR transposes cannot be updated in place: drop the old one and rebuild below */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta: device-resident scalar constants owned by the transpose struct */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate the transposed CSR storage; rows and columns are swapped with respect to A */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* device copy of the (uncompressed) row offsets of A, needed as input of csr2csc below */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      #if PETSC_PKG_CUDA_VERSION_GE(11,2,1)
        stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
      #else
        /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
           see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

           I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
           it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
           when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
        */
        if (matrixT->num_entries) {
          stat = cusparseCreateCsr(&matstructT->matDescr,
                                 matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                                 matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                                 matrixT->values->data().get(),
                                 CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I,
                                 indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);

        } else {
          matstructT->matDescr = NULL;
          matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
        }
      #endif
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose).
         tempT holds A^T in CSR form, hence it has A->cmap->n rows and A->rmap->n columns; its
         row_offsets array receives the CSC column pointers of A and must hold A->cmap->n+1 entries */
      tempT->num_rows = A->cmap->n;
      tempT->num_cols = A->rmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(tempT->num_rows+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB; the dimensions passed are those of the transposed matrix */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, tempT->num_rows, tempT->num_cols,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries (CsrMatrix does not own its arrays, so free them explicitly) */
      delete (THRUSTARRAY*) tempT->values;
      delete (THRUSTINTARRAY32*) tempT->column_indices;
      delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
      delete (THRUSTARRAY*) temp->values;
      delete (THRUSTINTARRAY32*) temp->column_indices;
      delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Compute the CSR -> CSC permutation once: transpose the "values" 0,1,...,nnz-1 so that
         entry k of the transposed values holds the CSR position of the k-th CSC entry */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      /* cache the permutation as integers; matrixT->values currently holds it as scalars */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* gather the current values of A through the cached permutation into the transpose */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1496bda325fcSPaul Mullowney 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - Transpose solve with the factored matrix A on the GPU,
  for factorizations that carry row and column permutations.

  Input Parameters:
+ A  - the factored matrix; A->spptr holds the Mat_SeqAIJCUSPARSETriFactors data
- bb - the right-hand side vector

  Output Parameter:
. xx - the solution vector

  Notes:
  The transposed triangular factors are created on first use by
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(). The solve gathers bb through rpermIndices into xx,
  solves with the transposed upper factor (xx -> work vector), then with the transposed lower
  factor (work vector -> xx), and finally gathers the result through cpermIndices. The last
  gather goes through the work vector because it cannot be done in place.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation.
     Thrust calls are wrapped so that a thrust exception is turned into a PETSc error */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
                                    thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
                                    xGPU));

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        tempGPU->data().get(),
                        upTriFactorT->solvePolicy, upTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                        tempGPU->data().get());CHKERRCUSPARSE(stat);
                      #endif

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(),
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        xarray,
                        loTriFactorT->solvePolicy, loTriFactorT->solveBuffer);CHKERRCUSPARSE(stat);
                      #else
                         xarray);CHKERRCUSPARSE(stat);
                      #endif

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
                                    thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
                                    tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1585bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - Transpose solve with the factored matrix A
  on the GPU when no row/column permutation has to be applied.

  Input Parameters:
+ A  - the factored matrix; A->spptr holds the Mat_SeqAIJCUSPARSETriFactors data
- bb - the right-hand side vector

  Output Parameter:
. xx - the solution vector

  Notes:
  The transposed triangular factors are generated lazily by
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(). Two triangular solves are performed: the
  transposed upper factor maps bb into the work vector, the transposed lower factor maps the
  work vector into xx.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  THRUSTARRAY                       *work = (THRUSTARRAY*)factors->workVector;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT,*upperT;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  cusparseStatus_t                  cstat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* neither transposed factor exists yet: analyze and build them now */
  if (!factors->loTriFactorPtrTranspose && !factors->upTriFactorPtrTranspose) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
  }
  lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
  upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;

  /* device arrays of the solution (write-only) and of the right-hand side (read-only) */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* step 1: transposed upper factor, rhs -> work */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  cstat = cusparse_solve(factors->handle, upperT->solveOp,
                         upperT->csrMat->num_rows, upperT->csrMat->num_entries,
                         &PETSC_CUSPARSE_ONE, upperT->descr,
                         upperT->csrMat->values->data().get(),
                         upperT->csrMat->row_offsets->data().get(),
                         upperT->csrMat->column_indices->data().get(),
                         upperT->solveInfo,
                         rhs, work->data().get(),
                         upperT->solvePolicy, upperT->solveBuffer);CHKERRCUSPARSE(cstat);
#else
  cstat = cusparse_solve(factors->handle, upperT->solveOp,
                         upperT->csrMat->num_rows,
                         &PETSC_CUSPARSE_ONE, upperT->descr,
                         upperT->csrMat->values->data().get(),
                         upperT->csrMat->row_offsets->data().get(),
                         upperT->csrMat->column_indices->data().get(),
                         upperT->solveInfo,
                         rhs, work->data().get());CHKERRCUSPARSE(cstat);
#endif

  /* step 2: transposed lower factor, work -> solution */
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  cstat = cusparse_solve(factors->handle, lowerT->solveOp,
                         lowerT->csrMat->num_rows, lowerT->csrMat->num_entries,
                         &PETSC_CUSPARSE_ONE, lowerT->descr,
                         lowerT->csrMat->values->data().get(),
                         lowerT->csrMat->row_offsets->data().get(),
                         lowerT->csrMat->column_indices->data().get(),
                         lowerT->solveInfo,
                         work->data().get(), sol,
                         lowerT->solvePolicy, lowerT->solveBuffer);CHKERRCUSPARSE(cstat);
#else
  cstat = cusparse_solve(factors->handle, lowerT->solveOp,
                         lowerT->csrMat->num_rows,
                         &PETSC_CUSPARSE_ONE, lowerT->descr,
                         lowerT->csrMat->values->data().get(),
                         lowerT->csrMat->row_offsets->data().get(),
                         lowerT->csrMat->column_indices->data().get(),
                         lowerT->solveInfo,
                         work->data().get(), sol);CHKERRCUSPARSE(cstat);
#endif

  /* hand the arrays back, then close the timer and log the flops */
  ierr = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1655bda325fcSPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE - Triangular-factor solve on the GPU with row and column permutations.

  Input:
    A  - factored matrix; A->spptr holds the Mat_SeqAIJCUSPARSETriFactors data
    bb - right-hand side (accessed read-only on the device)

  Output:
    xx - solution (accessed write-only on the device)

  Sequence: gather bb through rpermIndices into the work vector, solve with the lower
  factor, solve with the upper factor, then gather through cpermIndices into xx.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors          *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                           *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                     *rhs;
  PetscScalar                           *sol;
  thrust::device_ptr<const PetscScalar> rhsPtr;
  thrust::device_ptr<PetscScalar>       solPtr;
  cusparseStatus_t                      cs;
  PetscErrorCode                        ierr;

  PetscFunctionBegin;

  /* Obtain the device arrays and wrap them for use with thrust */
  ierr   = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr   = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);
  solPtr = thrust::device_pointer_cast(sol);
  rhsPtr = thrust::device_pointer_cast(rhs);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Step 1: apply the row permutation, bb -> work vector */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
               thrust::make_permutation_iterator(rhsPtr, factors->rpermIndices->begin()),
               thrust::make_permutation_iterator(rhsPtr, factors->rpermIndices->end()),
               work->begin());

  /* Step 2: solve with L, work vector -> xx */
  cs = cusparse_solve(factors->handle, lower->solveOp,
                      lower->csrMat->num_rows,
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      lower->csrMat->num_entries,
                    #endif
                      &PETSC_CUSPARSE_ONE, lower->descr,
                      lower->csrMat->values->data().get(),
                      lower->csrMat->row_offsets->data().get(),
                      lower->csrMat->column_indices->data().get(),
                      lower->solveInfo,
                      work->data().get(),
                      sol
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      ,lower->solvePolicy, lower->solveBuffer
                    #endif
                      );
  CHKERRCUSPARSE(cs);

  /* Step 3: solve with U, xx -> work vector */
  cs = cusparse_solve(factors->handle, upper->solveOp,
                      upper->csrMat->num_rows,
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      upper->csrMat->num_entries,
                    #endif
                      &PETSC_CUSPARSE_ONE, upper->descr,
                      upper->csrMat->values->data().get(),
                      upper->csrMat->row_offsets->data().get(),
                      upper->csrMat->column_indices->data().get(),
                      upper->solveInfo,
                      sol,
                      work->data().get()
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      ,upper->solvePolicy, upper->solveBuffer
                    #endif
                      );
  CHKERRCUSPARSE(cs);

  /* Step 4: apply the column permutation, work vector -> xx */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),
               thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->begin()),
               thrust::make_permutation_iterator(work->begin(), factors->cpermIndices->end()),
               solPtr);

  /* Hand the arrays back and close the timing/flop logging */
  ierr = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
17319ae82921SPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - Triangular-factor solve on the GPU without
  any permutation step.

  Input:
    A  - factored matrix; A->spptr holds the Mat_SeqAIJCUSPARSETriFactors data
    bb - right-hand side (accessed read-only on the device)

  Output:
    xx - solution (accessed write-only on the device)

  Sequence: solve with the lower factor from bb into the work vector, then solve with
  the upper factor from the work vector into xx.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  cusparseStatus_t                  cs;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Obtain the device arrays */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Step 1: solve with L, bb -> work vector */
  cs = cusparse_solve(factors->handle, lower->solveOp,
                      lower->csrMat->num_rows,
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      lower->csrMat->num_entries,
                    #endif
                      &PETSC_CUSPARSE_ONE, lower->descr,
                      lower->csrMat->values->data().get(),
                      lower->csrMat->row_offsets->data().get(),
                      lower->csrMat->column_indices->data().get(),
                      lower->solveInfo,
                      rhs,
                      work->data().get()
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      ,lower->solvePolicy,lower->solveBuffer
                    #endif
                      );
  CHKERRCUSPARSE(cs);

  /* Step 2: solve with U, work vector -> xx */
  cs = cusparse_solve(factors->handle, upper->solveOp,
                      upper->csrMat->num_rows,
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      upper->csrMat->num_entries,
                    #endif
                      &PETSC_CUSPARSE_ONE, upper->descr,
                      upper->csrMat->values->data().get(),
                      upper->csrMat->row_offsets->data().get(),
                      upper->csrMat->column_indices->data().get(),
                      upper->solveInfo,
                      work->data().get(),
                      sol
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      ,upper->solvePolicy, upper->solveBuffer
                    #endif
                      );
  CHKERRCUSPARSE(cs);

  /* Hand the arrays back and close the timing/flop logging */
  ierr = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
17939ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSECopyFromGPU - Copies the matrix values from the device CSR storage
  back into the host array a->a.

  Only acts when A->offloadmask is PETSC_OFFLOAD_GPU; afterwards the mask is set to
  PETSC_OFFLOAD_BOTH. Only the values array (a->nz scalars) is transferred.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij      = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *gpuData  = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cuerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  /* Nothing to do unless the device holds the only up-to-date copy */
  if (A->offloadmask != PETSC_OFFLOAD_GPU) {
    PetscFunctionReturn(0);
  }

  CsrMatrix    *csr    = (CsrMatrix*)gpuData->mat->mat;
  const size_t nbytes  = aij->nz*sizeof(PetscScalar);

  ierr  = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  cuerr = cudaMemcpy(aij->a, csr->values->data().get(), nbytes, cudaMemcpyDeviceToHost);CHKERRCUDA(cuerr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuToCpu(nbytes);CHKERRQ(ierr);
  ierr  = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
18147e8381f9SStefano Zampini 
/*
  MatSeqAIJGetArray_SeqAIJCUSPARSE - Returns the host values array for read/write access.

  The values are first brought back from the device via MatSeqAIJCUSPARSECopyFromGPU(),
  then *array is pointed at the host storage ((Mat_SeqAIJ*)A->data)->a.
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ     *aij;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
182467a45760SJunchao Zhang 
/*
  MatSeqAIJRestoreArray_SeqAIJCUSPARSE - Ends read/write access to the host values array.

  Clears the caller's pointer and sets A->offloadmask to PETSC_OFFLOAD_CPU.
*/
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
183267a45760SJunchao Zhang 
/*
  MatSeqAIJGetArrayRead_SeqAIJCUSPARSE - Returns the host values array for read-only access.

  The values are first brought back from the device via MatSeqAIJCUSPARSECopyFromGPU(),
  then *array is pointed at the host storage ((Mat_SeqAIJ*)A->data)->a.
*/
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJ     *aij;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  aij    = (Mat_SeqAIJ*)A->data;
  *array = aij->a;
  PetscFunctionReturn(0);
}
184267a45760SJunchao Zhang 
/*
  MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE - Ends read-only access to the host values array.

  Only clears the caller's pointer; A itself is not modified.
*/
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  array[0] = NULL;
  PetscFunctionReturn(0);
}
184967a45760SJunchao Zhang 
/*
  MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE - Returns the host values array for write-only access.

  No device-to-host copy is performed here; *array is simply pointed at
  ((Mat_SeqAIJ*)A->data)->a.
*/
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJ *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  *array = aij->a;
  PetscFunctionReturn(0);
}
185667a45760SJunchao Zhang 
/*
  MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE - Ends write-only access to the host values array.

  Clears the caller's pointer and sets A->offloadmask to PETSC_OFFLOAD_CPU.
*/
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscFunctionBegin;
  *array         = NULL;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
18647e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSECopyToGPU - Pushes the host CSR data of A to the device.

  Errors with PETSC_ERR_GPU if A is bound to the CPU. Does nothing unless
  A->offloadmask is PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.

  Two paths:
    - If A->nonzerostate equals the cached cusparsestruct->nonzerostate and the format
      is MAT_CUSPARSE_CSR, only the values array is re-assigned on the device.
    - Otherwise the existing device structure, work vector and row offsets are released
      and the device matrix is rebuilt (CSR, or ELL/HYB via cusparse_csr2hyb when built
      against CUDA older than 11.0).

  On the rebuild path, if the host values array a->a is NULL, offloadmask is left
  unchanged; in every other case it ends up as PETSC_OFFLOAD_BOTH.
*/
1865042217e8SBarry Smith PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
18669ae82921SPaul Mullowney {
1867aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
18687c700b8dSJunchao Zhang   Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
18699ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1870213423ffSJunchao Zhang   PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
18719ae82921SPaul Mullowney   PetscErrorCode               ierr;
1872aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
  /* both stays true unless the rebuild path finds no host values to upload */
1873abb89eb1SStefano Zampini   PetscBool                    both = PETSC_TRUE;
1874b06137fdSPaul Mullowney   cudaError_t                  err;
18759ae82921SPaul Mullowney 
18769ae82921SPaul Mullowney   PetscFunctionBegin;
1877e8d2b73aSMark Adams   if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Cannot copy to GPU");
1878c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
1879a49f1ed0SStefano Zampini     if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
1880a49f1ed0SStefano Zampini       CsrMatrix *matrix;
1881afb2bd1cSJunchao Zhang       matrix = (CsrMatrix*)cusparsestruct->mat->mat;
188285ba7357SStefano Zampini 
1883e8d2b73aSMark Adams       if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR values");
188485ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
1885afb2bd1cSJunchao Zhang       matrix->values->assign(a->a, a->a+a->nz);
188605035670SJunchao Zhang       err  = WaitForCUDA();CHKERRCUDA(err);
18874863603aSSatish Balay       ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
188885ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed but the structure did not: invalidate with PETSC_FALSE */
1889a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
189034d6c7a5SJose E. Roman     } else {
1891abb89eb1SStefano Zampini       PetscInt nnz;
189285ba7357SStefano Zampini       ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* release everything tied to the previous device structure before rebuilding */
18937c700b8dSJunchao Zhang       ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
1894a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
18957c700b8dSJunchao Zhang       delete cusparsestruct->workVector;
189681902715SJunchao Zhang       delete cusparsestruct->rowoffsets_gpu;
1897a49f1ed0SStefano Zampini       cusparsestruct->workVector = NULL;
1898a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu = NULL;
18999ae82921SPaul Mullowney       try {
        /* choose the row count, row offsets and row-index map: compressed-row data when in use, full CSR otherwise */
19009ae82921SPaul Mullowney         if (a->compressedrow.use) {
19019ae82921SPaul Mullowney           m    = a->compressedrow.nrows;
19029ae82921SPaul Mullowney           ii   = a->compressedrow.i;
19039ae82921SPaul Mullowney           ridx = a->compressedrow.rindex;
19049ae82921SPaul Mullowney         } else {
1905213423ffSJunchao Zhang           m    = A->rmap->n;
1906213423ffSJunchao Zhang           ii   = a->i;
1907e6e9a74fSStefano Zampini           ridx = NULL;
19089ae82921SPaul Mullowney         }
1909e8d2b73aSMark Adams         if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR row data");
1910e8d2b73aSMark Adams         if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_GPU,"Missing CSR column data");
        /* with no host values, take the entry count from the last row offset and do not mark the matrix as valid on both sides */
1911abb89eb1SStefano Zampini         if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
1912abb89eb1SStefano Zampini         else nnz = a->nz;
19139ae82921SPaul Mullowney 
191485ba7357SStefano Zampini         /* create cusparse matrix */
1915abb89eb1SStefano Zampini         cusparsestruct->nrows = m;
1916aa372e3fSPaul Mullowney         matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
191757d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
191857d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
191957d48284SJunchao Zhang         stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
19209ae82921SPaul Mullowney 
        /* device-resident scalar constants; the handle is switched to device pointer mode below */
1921afb2bd1cSJunchao Zhang         err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
19227656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
19237656d835SStefano Zampini         err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1924afb2bd1cSJunchao Zhang         err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
19257656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
19267656d835SStefano Zampini         err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
192757d48284SJunchao Zhang         stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
1928b06137fdSPaul Mullowney 
1929aa372e3fSPaul Mullowney         /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
1930aa372e3fSPaul Mullowney         if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
1931aa372e3fSPaul Mullowney           /* set the matrix */
1932afb2bd1cSJunchao Zhang           CsrMatrix *mat= new CsrMatrix;
1933afb2bd1cSJunchao Zhang           mat->num_rows = m;
1934afb2bd1cSJunchao Zhang           mat->num_cols = A->cmap->n;
1935abb89eb1SStefano Zampini           mat->num_entries = nnz;
1936afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1937afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m+1);
19389ae82921SPaul Mullowney 
1939abb89eb1SStefano Zampini           mat->column_indices = new THRUSTINTARRAY32(nnz);
1940abb89eb1SStefano Zampini           mat->column_indices->assign(a->j, a->j+nnz);
1941aa372e3fSPaul Mullowney 
          /* the values array is always allocated, but only filled when host values exist */
1942abb89eb1SStefano Zampini           mat->values = new THRUSTARRAY(nnz);
1943abb89eb1SStefano Zampini           if (a->a) mat->values->assign(a->a, a->a+nnz);
1944aa372e3fSPaul Mullowney 
1945aa372e3fSPaul Mullowney           /* assign the pointer */
1946afb2bd1cSJunchao Zhang           matstruct->mat = mat;
1947afb2bd1cSJunchao Zhang          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1948afb2bd1cSJunchao Zhang           if (mat->num_rows) { /* cusparse errors on empty matrices! */
1949afb2bd1cSJunchao Zhang             stat = cusparseCreateCsr(&matstruct->matDescr,
1950afb2bd1cSJunchao Zhang                                     mat->num_rows, mat->num_cols, mat->num_entries,
1951afb2bd1cSJunchao Zhang                                     mat->row_offsets->data().get(), mat->column_indices->data().get(),
1952afb2bd1cSJunchao Zhang                                     mat->values->data().get(),
1953afb2bd1cSJunchao Zhang                                     CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
1954afb2bd1cSJunchao Zhang                                     CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
1955afb2bd1cSJunchao Zhang           }
1956afb2bd1cSJunchao Zhang          #endif
1957aa372e3fSPaul Mullowney         } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
1958afb2bd1cSJunchao Zhang          #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1959afb2bd1cSJunchao Zhang           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1960afb2bd1cSJunchao Zhang          #else
          /* build a temporary CSR copy on the device, convert it to HYB/ELL, then free the temporary */
1961afb2bd1cSJunchao Zhang           CsrMatrix *mat= new CsrMatrix;
1962afb2bd1cSJunchao Zhang           mat->num_rows = m;
1963afb2bd1cSJunchao Zhang           mat->num_cols = A->cmap->n;
1964abb89eb1SStefano Zampini           mat->num_entries = nnz;
1965afb2bd1cSJunchao Zhang           mat->row_offsets = new THRUSTINTARRAY32(m+1);
1966afb2bd1cSJunchao Zhang           mat->row_offsets->assign(ii, ii + m+1);
1967aa372e3fSPaul Mullowney 
1968abb89eb1SStefano Zampini           mat->column_indices = new THRUSTINTARRAY32(nnz);
1969abb89eb1SStefano Zampini           mat->column_indices->assign(a->j, a->j+nnz);
1970aa372e3fSPaul Mullowney 
1971abb89eb1SStefano Zampini           mat->values = new THRUSTARRAY(nnz);
1972abb89eb1SStefano Zampini           if (a->a) mat->values->assign(a->a, a->a+nnz);
1973aa372e3fSPaul Mullowney 
1974aa372e3fSPaul Mullowney           cusparseHybMat_t hybMat;
197557d48284SJunchao Zhang           stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1976aa372e3fSPaul Mullowney           cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1977aa372e3fSPaul Mullowney             CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1978afb2bd1cSJunchao Zhang           stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
1979afb2bd1cSJunchao Zhang               matstruct->descr, mat->values->data().get(),
1980afb2bd1cSJunchao Zhang               mat->row_offsets->data().get(),
1981afb2bd1cSJunchao Zhang               mat->column_indices->data().get(),
198257d48284SJunchao Zhang               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1983aa372e3fSPaul Mullowney           /* assign the pointer */
1984aa372e3fSPaul Mullowney           matstruct->mat = hybMat;
1985aa372e3fSPaul Mullowney 
1986afb2bd1cSJunchao Zhang           if (mat) {
1987afb2bd1cSJunchao Zhang             if (mat->values) delete (THRUSTARRAY*)mat->values;
1988afb2bd1cSJunchao Zhang             if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
1989afb2bd1cSJunchao Zhang             if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
1990afb2bd1cSJunchao Zhang             delete (CsrMatrix*)mat;
1991087f3262SPaul Mullowney           }
1992afb2bd1cSJunchao Zhang          #endif
1993087f3262SPaul Mullowney         }
1994ca45077fSPaul Mullowney 
1995aa372e3fSPaul Mullowney         /* assign the compressed row indices */
1996213423ffSJunchao Zhang         if (a->compressedrow.use) {
1997213423ffSJunchao Zhang           cusparsestruct->workVector = new THRUSTARRAY(m);
1998aa372e3fSPaul Mullowney           matstruct->cprowIndices    = new THRUSTINTARRAY(m);
1999aa372e3fSPaul Mullowney           matstruct->cprowIndices->assign(ridx,ridx+m);
2000213423ffSJunchao Zhang           tmp = m;
2001213423ffSJunchao Zhang         } else {
2002213423ffSJunchao Zhang           cusparsestruct->workVector = NULL;
2003213423ffSJunchao Zhang           matstruct->cprowIndices    = NULL;
2004213423ffSJunchao Zhang           tmp = 0;
2005213423ffSJunchao Zhang         }
        /* logged transfer size: row offsets and column indices as int, tmp compressed-row indices as PetscInt,
           and 3+nz scalars (the 3 presumably being alpha_one/beta_zero/beta_one).
           NOTE(review): this uses a->nz rather than nnz, and counts the values even when a->a is NULL - confirm intended */
2006213423ffSJunchao Zhang         ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);
2007aa372e3fSPaul Mullowney 
2008aa372e3fSPaul Mullowney         /* assign the pointer */
2009aa372e3fSPaul Mullowney         cusparsestruct->mat = matstruct;
      /* NOTE(review): only exceptions thrown as char* are caught here; any other exception type propagates - confirm intended */
20109ae82921SPaul Mullowney       } catch(char *ex) {
20119ae82921SPaul Mullowney         SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
20129ae82921SPaul Mullowney       }
201305035670SJunchao Zhang       err  = WaitForCUDA();CHKERRCUDA(err);
201485ba7357SStefano Zampini       ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* remember which nonzero structure the device copy corresponds to */
201534d6c7a5SJose E. Roman       cusparsestruct->nonzerostate = A->nonzerostate;
201634d6c7a5SJose E. Roman     }
2017abb89eb1SStefano Zampini     if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
20189ae82921SPaul Mullowney   }
20199ae82921SPaul Mullowney   PetscFunctionReturn(0);
20209ae82921SPaul Mullowney }
20219ae82921SPaul Mullowney 
/* Functor over a 2-tuple: element 1 receives the sum of element 1 and element 0 */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple entry)
  {
    thrust::get<1>(entry) = thrust::get<1>(entry) + thrust::get<0>(entry);
  }
};
2031aa372e3fSPaul Mullowney 
/* Functor over a 2-tuple: element 1 is overwritten with element 0 */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple entry)
  {
    thrust::get<1>(entry) = thrust::get<0>(entry);
  }
};
20417e8381f9SStefano Zampini 
/* Functor over a 2-tuple: element 0 is overwritten with element 1 (the reverse of VecCUDAEquals) */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple entry)
  {
    thrust::get<0>(entry) = thrust::get<1>(entry);
  }
};
2051e6e9a74fSStefano Zampini 
/*
  MatMatCusparse - Per-product data kept for cuSPARSE-based matrix-matrix products.
  Released by MatDestroy_MatMatCusparse().
*/
struct MatMatCusparse {
  PetscBool      cisdense;
  PetscScalar    *Bt;       /* device buffer, released with cudaFree() on destroy */
  Mat            X;         /* released with MatDestroy() on destroy */
  PetscBool      reusesym;  /* cuSPARSE has no separate symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble flops;
  CsrMatrix      *Bcsr;     /* released with delete on destroy */

#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;
  PetscBool             initialized;  /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;
  cusparseDnMatDescr_t  matCDescr;
  PetscInt              Blda,Clda;    /* leading dimensions of B and C, recorded so that changes can be detected */
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  void                  *dBuffer4;
  void                  *dBuffer5;
 #endif
  size_t                mmBufferSize;
  void                  *mmBuffer;
  void                  *mmBuffer2;   /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
2076ccdfe979SStefano Zampini 
/*
  MatDestroy_MatMatCusparse - Releases everything owned by a MatMatCusparse object.

  Input:
    data - pointer to the MatMatCusparse structure; it is freed with PetscFree() at the end

  Frees the device buffer Bt, deletes Bcsr, destroys the cuSPARSE descriptors and
  work buffers that are set (CUDA >= 11.0 only), and destroys the matrix X.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse*)data;
  PetscErrorCode   ierr;
  cudaError_t      cuerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t cs;
 #endif

  PetscFunctionBegin;
  cuerr = cudaFree(mm->Bt);CHKERRCUDA(cuerr);
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* descriptors are destroyed only when they have been created */
  if (mm->matSpBDescr) {
    cs = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(cs);
  }
  if (mm->matBDescr) {
    cs = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(cs);
  }
  if (mm->matCDescr) {
    cs = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(cs);
  }
  if (mm->spgemmDesc) {
    cs = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(cs);
  }
 #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
  if (mm->dBuffer4) {
    cuerr = cudaFree(mm->dBuffer4);CHKERRCUDA(cuerr);
  }
  if (mm->dBuffer5) {
    cuerr = cudaFree(mm->dBuffer5);CHKERRCUDA(cuerr);
  }
 #endif
  if (mm->mmBuffer) {
    cuerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cuerr);
  }
  if (mm->mmBuffer2) {
    cuerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cuerr);
  }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2105ccdfe979SStefano Zampini 
PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);

/*
  Numeric phase of the products between a SeqAIJCUSPARSE matrix A and a dense matrix B.

  Input:
    C - the product matrix; C->product must carry the MatMatCusparse data created by
        MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA()

  Supported product types are AB, AtB, ABt, PtAP and RARt. For PtAP and RARt the
  sparse-times-dense result is stored in the intermediate matrix mmdata->X and the
  final C is obtained with MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private().

  If B is not a MATSEQDENSECUDA matrix it is converted in place for the duration of the
  call and converted back to MATSEQDENSE before returning; likewise C is converted back
  to MATSEQDENSE when mmdata->cisdense is set.

  Errors if the product data is missing, if A is not MATSEQAIJCUSPARSE, if A is bound to
  the CPU, or if the product type is not one of those listed above.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  PetscInt                     m,n,blda,clda;
  PetscBool                    flg,biscuda;
  Mat_SeqAIJCUSPARSE           *cusp;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA;
  const PetscScalar            *barray;
  PetscScalar                  *carray;
  PetscErrorCode               ierr;
  MatMatCusparse               *mmdata;
  Mat_SeqAIJCUSPARSEMultStruct *mat;
  CsrMatrix                    *csrmat;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  mmdata = (MatMatCusparse*)product->data;
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  /* currently CopyToGpu does not copy if the matrix is bound to CPU
     Instead of silently accepting the wrong answer, I prefer to raise the error */
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;

  /* select the sparse operand (A or its explicit transpose), the operation applied to it,
     and the m x n size of the sparse-times-dense result */
  switch (product->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_PtAP:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    if (!A->form_explicit_transpose) {
      mat = cusp->mat;
      opA = CUSPARSE_OPERATION_TRANSPOSE;
    } else {
      ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
      mat  = cusp->matTranspose;
      opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
    }
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
  case MATPRODUCT_RARt:
    mat = cusp->mat;
    opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
    m   = A->rmap->n;
    n   = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csrmat = (CsrMatrix*)mat->mat;
  /* if the user passed a CPU matrix, copy the data to the GPU */
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
  if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
  ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);

  ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
  /* the sparse-times-dense result goes to the intermediate X for the triple products, to C otherwise */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
    ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
  }

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
  /* (re)allocate mmBuffer if not initialized or LDAs are different */
  if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
    size_t mmBufferSize;
    if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
    if (!mmdata->matBDescr) {
      stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Blda = blda;
    }

    if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
    if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
      stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
      mmdata->Clda = clda;
    }

    if (!mat->matDescr) {
      stat = cusparseCreateCsr(&mat->matDescr,
                               csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
                               csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
                               csrmat->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                               CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
    }
    stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
                                   mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                                   mmdata->matCDescr,cusparse_scalartype,
                                   cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
    /* grow the work buffer only when there is none yet or the current one is too small */
    if (!mmdata->mmBuffer || mmdata->mmBufferSize < mmBufferSize) {
      cudaError_t cerr;
      cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
      /* reset before allocating: if cudaMalloc fails we return early, and the destructor
         must not see the pointer that has just been freed */
      mmdata->mmBuffer     = NULL;
      mmdata->mmBufferSize = 0;
      cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
      mmdata->mmBufferSize = mmBufferSize;
    }
    mmdata->initialized = PETSC_TRUE;
  }
  /* Always refresh the value pointers of the descriptors. A descriptor that survived the
     branch above (e.g. matCDescr when only the LDA of B changed) still holds the array
     pointer it was created with, which may differ from the arrays obtained in this call. */
  stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
  stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
  stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);

  /* do cusparseSpMM, which supports transpose on B */
  stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
                      mat->matDescr,mmdata->matBDescr,mat->beta_zero,
                      mmdata->matCDescr,cusparse_scalartype,
                      cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
 #else
  PetscInt k;
  /* cusparseXcsrmm does not support transpose on B */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cublasHandle_t cublasv2handle;
    cublasStatus_t cerr;

    /* form B^T explicitly in mmdata->Bt (allocated by the symbolic phase) */
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
                       B->cmap->n,B->rmap->n,
                       &PETSC_CUSPARSE_ONE ,barray,blda,
                       &PETSC_CUSPARSE_ZERO,barray,blda,
                       mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
    blda = B->cmap->n;
    k    = B->cmap->n;
  } else {
    k    = B->rmap->n;
  }

  /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
  stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
                           csrmat->num_entries,mat->alpha_one,mat->descr,
                           csrmat->values->data().get(),
                           csrmat->row_offsets->data().get(),
                           csrmat->column_indices->data().get(),
                           mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
                           carray,clda);CHKERRCUSPARSE(stat);
 #endif
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
  ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
  /* finish the triple products with a dense-dense multiplication involving B and X */
  if (product->type == MATPRODUCT_RARt) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
  } else if (product->type == MATPRODUCT_PtAP) {
    ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
    ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  } else {
    ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
  }
  /* hand CPU matrices back to the caller in their original type */
  if (mmdata->cisdense) {
    ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
  }
  if (!biscuda) {
    ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
2280ccdfe979SStefano Zampini 
/*
  Symbolic phase of the products between a SeqAIJCUSPARSE matrix A and a dense matrix B.

  Input:
    C - the product matrix; C->product->data must still be empty

  Sets the sizes of C, switches its type to MATSEQDENSECUDA (recording whether it was
  MATSEQDENSE before), allocates the MatMatCusparse product data together with the
  auxiliary storage the numeric phase needs, and installs
  MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA() as the numeric routine.

  Errors if product data is already present, if A is not MATSEQAIJCUSPARSE, if A is not
  stored in MAT_CUSPARSE_CSR format, or if the product type is unsupported.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");

  /* dimensions (m x n) of the final product C */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n; n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n; n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n; n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n; n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n; n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    /* X has the rows of A; its columns are the rows of B for RARt and the columns of B for PtAP */
    const PetscInt xcols = (product->type == MATPRODUCT_RARt) ? B->rmap->n : B->cmap->n;

    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
    ierr = MatSetSizes(mmdata->X,A->rmap->n,xcols,A->rmap->n,xcols);CHKERRQ(ierr);
  }
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2354ccdfe979SStefano Zampini 
/*
  Numeric phase of the sparse-sparse products (AB, AtB, ABt) between two SeqAIJCUSPARSE
  matrices.

  Input:
    C - the product matrix (MATSEQAIJCUSPARSE) carrying MatMatCusparse product data

  When mmdata->reusesym is set, or when C has no nonzeros, no computation is performed
  and only the assembly bookkeeping at the end of the routine is updated. Otherwise the
  values of C are recomputed on the GPU with the SpGEMM descriptors stored in the
  product data (CUDA >= 11) or with cusparse_csr_spgemm (older CUDA).

  Errors if the product data is missing, if A, B or C have the wrong type or storage
  format, if A or B are bound to the CPU, if symmetry of A or B is detected but was not
  used by the symbolic phase, or if the product type is unsupported.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  /* all locals are declared up front: the jumps to "finalize" must not bypass initializations */
  Mat_Product                  *product = C->product;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  PetscBool                    flg;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
  MatProductType               ptype;
  MatMatCusparse               *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A      = product->A;
  B      = product->B;

  /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
  if (mmdata->reusesym) {
    mmdata->reusesym = PETSC_FALSE; /* the shortcut is valid for one call only */
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");
    goto finalize;
  }
  /* nothing to compute for an empty C */
  if (!c->nz) goto finalize;

  /* validate the operands */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* a transposed product with a symmetric operand is treated as a plain AB,
     provided the symbolic phase made the same choice */
  ptype = product->type;
  if (A->symmetric && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_A_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    if (!product->symbolic_used_the_fact_B_is_symmetric) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Symbolic should have been built using the fact that B is symmetric");
  }

  /* transposition is realized by picking the explicitly stored transpose of the operand */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C mult struct for product type %s",MatProductTypes[ptype]);
  Acsr = (CsrMatrix*)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix*)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing C CSR struct");

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
    /* recompute the values of C reusing the structure stored in spgemmDesc */
    stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
                                       Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                       cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                       mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #else
    /* compute into the work buffer, then copy the result into the CSR arrays of C */
    stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
                                  Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                                  cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                  mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
    stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
                               Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
                               cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
  #endif
#else
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
                             Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
                             Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
                             Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
                             Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU; /* the up-to-date values of C now live on the GPU */

finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->assembled        = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2484fcdce8c4SStefano Zampini 
2485fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2486fcdce8c4SStefano Zampini {
2487fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2488fcdce8c4SStefano Zampini   Mat                          A,B;
2489fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2490fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a,*b,*c;
2491fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2492fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2493fcdce8c4SStefano Zampini   PetscInt                     i,j,m,n,k;
2494fcdce8c4SStefano Zampini   PetscBool                    flg;
2495fcdce8c4SStefano Zampini   PetscErrorCode               ierr;
2496fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2497fcdce8c4SStefano Zampini   cudaError_t                  cerr;
2498fcdce8c4SStefano Zampini   MatProductType               ptype;
2499fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2500fcdce8c4SStefano Zampini   PetscLogDouble               flops;
2501fcdce8c4SStefano Zampini   PetscBool                    biscompressed,ciscompressed;
2502fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2503fcdce8c4SStefano Zampini   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2504fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2505fcdce8c4SStefano Zampini #else
2506fcdce8c4SStefano Zampini   int                          cnz;
2507fcdce8c4SStefano Zampini #endif
2508b4285af6SJunchao Zhang   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE,opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */
2509fcdce8c4SStefano Zampini 
2510fcdce8c4SStefano Zampini   PetscFunctionBegin;
2511fcdce8c4SStefano Zampini   MatCheckProduct(C,1);
2512e8d2b73aSMark Adams   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Product data not empty");
2513fcdce8c4SStefano Zampini   A    = product->A;
2514fcdce8c4SStefano Zampini   B    = product->B;
2515fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2516e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for type %s",((PetscObject)A)->type_name);
2517fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2518e8d2b73aSMark Adams   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Not for B of type %s",((PetscObject)B)->type_name);
2519fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ*)A->data;
2520fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ*)B->data;
2521fcdce8c4SStefano Zampini   /* product data */
2522fcdce8c4SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2523fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2524fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2525fcdce8c4SStefano Zampini 
2526fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2527fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2528d60bce21SJunchao Zhang   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */
2529d60bce21SJunchao Zhang   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2530d60bce21SJunchao Zhang   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2531d60bce21SJunchao Zhang   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Only for MAT_CUSPARSE_CSR format");
2532d60bce21SJunchao Zhang 
2533fcdce8c4SStefano Zampini   ptype = product->type;
2534fa046f9fSJunchao Zhang   if (A->symmetric && ptype == MATPRODUCT_AtB) {
2535fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2536fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
2537fa046f9fSJunchao Zhang   }
2538fa046f9fSJunchao Zhang   if (B->symmetric && ptype == MATPRODUCT_ABt) {
2539fa046f9fSJunchao Zhang     ptype = MATPRODUCT_AB;
2540fa046f9fSJunchao Zhang     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
2541fa046f9fSJunchao Zhang   }
2542fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2543fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2544fcdce8c4SStefano Zampini   switch (ptype) {
2545fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2546fcdce8c4SStefano Zampini     m = A->rmap->n;
2547fcdce8c4SStefano Zampini     n = B->cmap->n;
2548fcdce8c4SStefano Zampini     k = A->cmap->n;
2549fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2550fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2551fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2552fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2553fcdce8c4SStefano Zampini     break;
2554fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2555fcdce8c4SStefano Zampini     m = A->cmap->n;
2556fcdce8c4SStefano Zampini     n = B->cmap->n;
2557fcdce8c4SStefano Zampini     k = A->rmap->n;
25583606e59fSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
2559fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2560fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2561fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2562fcdce8c4SStefano Zampini     break;
2563fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2564fcdce8c4SStefano Zampini     m = A->rmap->n;
2565fcdce8c4SStefano Zampini     n = B->rmap->n;
2566fcdce8c4SStefano Zampini     k = A->cmap->n;
25673606e59fSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
2568fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2569fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2570fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2571fcdce8c4SStefano Zampini     break;
2572fcdce8c4SStefano Zampini   default:
2573e8d2b73aSMark Adams     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Unsupported product type %s",MatProductTypes[product->type]);
2574fcdce8c4SStefano Zampini   }
2575fcdce8c4SStefano Zampini 
2576fcdce8c4SStefano Zampini   /* create cusparse matrix */
2577fcdce8c4SStefano Zampini   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2578fcdce8c4SStefano Zampini   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2579fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ*)C->data;
2580fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2581fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2582fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2583fcdce8c4SStefano Zampini 
2584fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2585fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2586fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
2587fcdce8c4SStefano Zampini     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2588fcdce8c4SStefano Zampini     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2589fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2590fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2591fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2592fcdce8c4SStefano Zampini   } else {
2593fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2594fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2595fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2596fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2597fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2598fcdce8c4SStefano Zampini   }
2599fcdce8c4SStefano Zampini   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2600fcdce8c4SStefano Zampini   Ccusp->mat      = Cmat;
2601fcdce8c4SStefano Zampini   Ccusp->mat->mat = Ccsr;
2602fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2603fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2604fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2605fcdce8c4SStefano Zampini   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2606fcdce8c4SStefano Zampini   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2607fcdce8c4SStefano Zampini   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2608fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2609fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2610fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2611fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2612fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2613fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2614fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2615fcdce8c4SStefano Zampini     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2616fcdce8c4SStefano Zampini     c->nz = 0;
2617fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2618fcdce8c4SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
2619fcdce8c4SStefano Zampini     goto finalizesym;
2620fcdce8c4SStefano Zampini   }
2621fcdce8c4SStefano Zampini 
2622e8d2b73aSMark Adams   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2623e8d2b73aSMark Adams   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2624fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2625fcdce8c4SStefano Zampini   if (!biscompressed) {
2626fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix*)Bmat->mat;
2627fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2628fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2629fcdce8c4SStefano Zampini #endif
2630fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2631fcdce8c4SStefano Zampini     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2632fcdce8c4SStefano Zampini     Bcsr = new CsrMatrix;
2633fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2634fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2635fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2636fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2637fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2638fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2639fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2640fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2641fcdce8c4SStefano Zampini       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2642fcdce8c4SStefano Zampini     }
2643fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2644fcdce8c4SStefano Zampini     mmdata->Bcsr = Bcsr;
2645fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2646fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
2647fcdce8c4SStefano Zampini       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2648fcdce8c4SStefano Zampini                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2649fcdce8c4SStefano Zampini                                Bcsr->values->data().get(),
2650fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2651fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2652fcdce8c4SStefano Zampini     }
2653fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2654fcdce8c4SStefano Zampini #endif
2655fcdce8c4SStefano Zampini   }
2656e8d2b73aSMark Adams   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing A CSR struct");
2657e8d2b73aSMark Adams   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_GPU,"Missing B CSR struct");
2658fcdce8c4SStefano Zampini   /* precompute flops count */
2659fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2660fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2661fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2662fcdce8c4SStefano Zampini       const PetscInt en = a->i[i+1];
2663fcdce8c4SStefano Zampini       for (j=st; j<en; j++) {
2664fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2665fcdce8c4SStefano Zampini         flops += 2.*(b->i[brow+1] - b->i[brow]);
2666fcdce8c4SStefano Zampini       }
2667fcdce8c4SStefano Zampini     }
2668fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2669fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2670fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i+1] - a->i[i];
2671fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i+1] - b->i[i];
2672fcdce8c4SStefano Zampini       flops += (2.*anzi)*bnzi;
2673fcdce8c4SStefano Zampini     }
2674fcdce8c4SStefano Zampini   } else { /* TODO */
2675fcdce8c4SStefano Zampini     flops = 0.;
2676fcdce8c4SStefano Zampini   }
2677fcdce8c4SStefano Zampini 
2678fcdce8c4SStefano Zampini   mmdata->flops = flops;
2679fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2680b4285af6SJunchao Zhang 
2681fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2682fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2683fcdce8c4SStefano Zampini   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2684fcdce8c4SStefano Zampini                           NULL, NULL, NULL,
2685fcdce8c4SStefano Zampini                           CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2686fcdce8c4SStefano Zampini                           CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2687fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2688b4285af6SJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2689b4285af6SJunchao Zhang  {
2690b4285af6SJunchao Zhang   /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it.
2691b4285af6SJunchao Zhang      We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse
2692b4285af6SJunchao Zhang   */
2693b4285af6SJunchao Zhang   void*  dBuffer1 = NULL;
2694b4285af6SJunchao Zhang   void*  dBuffer2 = NULL;
2695b4285af6SJunchao Zhang   void*  dBuffer3 = NULL;
2696b4285af6SJunchao Zhang   /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */
2697b4285af6SJunchao Zhang   size_t bufferSize1 = 0;
2698b4285af6SJunchao Zhang   size_t bufferSize2 = 0;
2699b4285af6SJunchao Zhang   size_t bufferSize3 = 0;
2700b4285af6SJunchao Zhang   size_t bufferSize4 = 0;
2701b4285af6SJunchao Zhang   size_t bufferSize5 = 0;
2702b4285af6SJunchao Zhang 
2703b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2704b4285af6SJunchao Zhang   /* ask bufferSize1 bytes for external memory */
2705b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2706b4285af6SJunchao Zhang                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2707b4285af6SJunchao Zhang                                             &bufferSize1, NULL);CHKERRCUSPARSE(stat);
2708b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &dBuffer1, bufferSize1);CHKERRCUDA(cerr);
2709b4285af6SJunchao Zhang   /* inspect the matrices A and B to understand the memory requirement for the next step */
2710b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2711b4285af6SJunchao Zhang                                             CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2712b4285af6SJunchao Zhang                                             &bufferSize1, dBuffer1);CHKERRCUSPARSE(stat);
2713b4285af6SJunchao Zhang 
2714b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2715b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2716b4285af6SJunchao Zhang                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2717b4285af6SJunchao Zhang                                  &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL);CHKERRCUSPARSE(stat);
2718b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &dBuffer2, bufferSize2);CHKERRCUDA(cerr);
2719b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &dBuffer3, bufferSize3);CHKERRCUDA(cerr);
2720b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &mmdata->dBuffer4, bufferSize4);CHKERRCUDA(cerr);
2721b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2722b4285af6SJunchao Zhang                                  CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2723b4285af6SJunchao Zhang                                  &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4);CHKERRCUSPARSE(stat);
2724b4285af6SJunchao Zhang   cerr = cudaFree(dBuffer1);CHKERRCUDA(cerr);
2725b4285af6SJunchao Zhang   cerr = cudaFree(dBuffer2);CHKERRCUDA(cerr);
2726b4285af6SJunchao Zhang 
2727b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2728b4285af6SJunchao Zhang   /* get matrix C non-zero entries C_nnz1 */
2729b4285af6SJunchao Zhang   stat  = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2730b4285af6SJunchao Zhang   c->nz = (PetscInt) C_nnz1;
2731b4285af6SJunchao Zhang   /* allocate matrix C */
2732b4285af6SJunchao Zhang   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2733b4285af6SJunchao Zhang   Ccsr->values         = new THRUSTARRAY(c->nz);CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2734b4285af6SJunchao Zhang   /* update matC with the new pointers */
2735b4285af6SJunchao Zhang   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2736b4285af6SJunchao Zhang                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2737b4285af6SJunchao Zhang 
2738b4285af6SJunchao Zhang   /*----------------------------------------------------------------------*/
2739b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2740b4285af6SJunchao Zhang                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2741b4285af6SJunchao Zhang                                   &bufferSize5, NULL);CHKERRCUSPARSE(stat);
2742b4285af6SJunchao Zhang   cerr = cudaMalloc((void**) &mmdata->dBuffer5, bufferSize5);CHKERRCUDA(cerr);
2743b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr,
2744b4285af6SJunchao Zhang                                   CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc,
2745b4285af6SJunchao Zhang                                   &bufferSize5, mmdata->dBuffer5);CHKERRCUSPARSE(stat);
2746b4285af6SJunchao Zhang   cerr = cudaFree(dBuffer3);CHKERRCUDA(cerr);
2747b4285af6SJunchao Zhang   stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB,
2748b4285af6SJunchao Zhang                                      Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2749b4285af6SJunchao Zhang                                      cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2750b4285af6SJunchao Zhang                                      mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2751c0aa6a63SJacob Faibussowitsch   ierr = PetscInfo9(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufferSize4/1024,bufferSize5/1024);CHKERRQ(ierr);
2752b4285af6SJunchao Zhang  }
2753ae37ee31SJunchao Zhang  #else
2754b4285af6SJunchao Zhang   size_t bufSize2;
2755fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
2756b4285af6SJunchao Zhang   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2757fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2758fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2759fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2760bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2761fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
2762b4285af6SJunchao Zhang   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB,
2763fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2764fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2765fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2766fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
2767b4285af6SJunchao Zhang   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2768fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2769fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2770fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2771fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
2772fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2773fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2774fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2775fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
2776bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2777fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
2778b4285af6SJunchao Zhang   stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB,
2779fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2780fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2781fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2782fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
2783fcdce8c4SStefano Zampini   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2784fcdce8c4SStefano Zampini   c->nz = (PetscInt) C_nnz1;
2785c0aa6a63SJacob Faibussowitsch   ierr = PetscInfo9(C,"Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
2786fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2787fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2788fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2789fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2790fcdce8c4SStefano Zampini   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2791fcdce8c4SStefano Zampini                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2792b4285af6SJunchao Zhang   stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB,
2793fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2794fcdce8c4SStefano Zampini                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2795ae37ee31SJunchao Zhang  #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0)
2796fcdce8c4SStefano Zampini #else
2797fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2798b4285af6SJunchao Zhang   stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB,
2799fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2800fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2801fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2802fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2803fcdce8c4SStefano Zampini   c->nz = cnz;
2804fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2805fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2806fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2807fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2808fcdce8c4SStefano Zampini 
2809fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2810fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2811fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2812fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2813b4285af6SJunchao Zhang   stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB,
2814fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2815fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2816fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2817fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2818fcdce8c4SStefano Zampini #endif
2819fcdce8c4SStefano Zampini   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2820fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2821fcdce8c4SStefano Zampini finalizesym:
2822fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
2823fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
2824fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
2825fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2826fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2827fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2828fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2829fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2830fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2831fcdce8c4SStefano Zampini     ii   = *Ccsr->row_offsets;
2832fcdce8c4SStefano Zampini     jj   = *Ccsr->column_indices;
2833fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2834fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2835fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2836fcdce8c4SStefano Zampini   } else {
2837fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2838fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2839fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2840fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2841fcdce8c4SStefano Zampini   }
2842fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
2843fcdce8c4SStefano Zampini     PetscInt r = 0;
2844fcdce8c4SStefano Zampini     c->i[0] = 0;
2845fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
2846fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
2847fcdce8c4SStefano Zampini       const PetscInt old = c->compressedrow.i[k];
2848fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r+1] = old;
2849fcdce8c4SStefano Zampini     }
2850fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2851fcdce8c4SStefano Zampini   }
2852fcdce8c4SStefano Zampini   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2853fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2854fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2855fcdce8c4SStefano Zampini   c->maxnz = c->nz;
2856fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
2857fcdce8c4SStefano Zampini   c->rmax = 0;
2858fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
2859fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
2860fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
2861fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
2862fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
2863fcdce8c4SStefano Zampini   }
2864fcdce8c4SStefano Zampini   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2865fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2866fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
2867fcdce8c4SStefano Zampini 
2868fcdce8c4SStefano Zampini   C->nonzerostate++;
2869fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2870fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2871fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
2872fcdce8c4SStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2873fcdce8c4SStefano Zampini   C->preallocated  = PETSC_TRUE;
2874fcdce8c4SStefano Zampini   C->assembled     = PETSC_FALSE;
2875fcdce8c4SStefano Zampini   C->was_assembled = PETSC_FALSE;
2876abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2877fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
2878fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
2879fcdce8c4SStefano Zampini   }
2880fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2881fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
2882fcdce8c4SStefano Zampini }
2883fcdce8c4SStefano Zampini 
PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/*
  MatProductSetFromOptions_SeqAIJCUSPARSE - picks the symbolic product routine for a product whose
  A operand is SEQAIJCUSPARSE. Handles sparse or dense B.

  Selection logic:
   - B dense: use the CUSPARSE x dense-CUDA kernel, unless A is bound to the CPU (then defer to the
     SeqAIJ x SeqDense setup); ABC products use the basic (pairwise) implementation.
   - B (and C, for ABC) SEQAIJCUSPARSE and nothing bound to the CPU: use the CUSPARSE x CUSPARSE
     kernel for AB, AtB, ABt; PtAP, RARt and ABC use the basic implementation.
     The user can veto the GPU path with a "-..._backend_cpu" option.
   - anything else: fall back to the SeqAIJ setup.
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      isdense = PETSC_FALSE,Biscusp = PETSC_FALSE,Ciscusp = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
  /* B only counts as a GPU matrix when neither A nor B is pinned to the CPU */
  if (!(product->A->boundtocpu || product->B->boundtocpu)) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
  }
  /* C is only an operand for ABC; for every other product type Ciscusp stays PETSC_TRUE */
  if (product->type == MATPRODUCT_ABC) {
    Ciscusp = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Ciscusp);CHKERRQ(ierr);
    }
  }

  if (Biscusp && Ciscusp) { /* we can always select the CPU backend */
    const PetscBool apiuser  = product->api_user;
    const char      *title   = NULL; /* options block title */
    const char      *optname = NULL; /* name of the boolean option; NULL means no option for this product type */
    const char      *manpage = NULL; /* manual page associated with the option */
    PetscBool       usecpu   = PETSC_FALSE;

    switch (product->type) {
    case MATPRODUCT_AB:
      manpage = "MatMatMult";
      title   = apiuser ? "MatMatMult" : "MatProduct_AB";
      optname = apiuser ? "-matmatmult_backend_cpu" : "-matproduct_ab_backend_cpu";
      break;
    case MATPRODUCT_AtB:
      manpage = "MatTransposeMatMult";
      title   = apiuser ? "MatTransposeMatMult" : "MatProduct_AtB";
      optname = apiuser ? "-mattransposematmult_backend_cpu" : "-matproduct_atb_backend_cpu";
      break;
    case MATPRODUCT_PtAP:
      manpage = "MatPtAP";
      title   = apiuser ? "MatPtAP" : "MatProduct_PtAP";
      optname = apiuser ? "-matptap_backend_cpu" : "-matproduct_ptap_backend_cpu";
      break;
    case MATPRODUCT_RARt:
      manpage = "MatRARt";
      title   = apiuser ? "MatRARt" : "MatProduct_RARt";
      optname = apiuser ? "-matrart_backend_cpu" : "-matproduct_rart_backend_cpu";
      break;
    case MATPRODUCT_ABC:
      manpage = "MatMatMatMult";
      title   = apiuser ? "MatMatMatMult" : "MatProduct_ABC";
      optname = apiuser ? "-matmatmatmult_backend_cpu" : "-matproduct_abc_backend_cpu";
      break;
    default: /* no CPU-backend option for the remaining product types */
      break;
    }
    if (optname) {
      ierr = PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,title,"Mat");CHKERRQ(ierr);
      ierr = PetscOptionsBool(optname,"Use CPU code",manpage,usecpu,&usecpu,NULL);CHKERRQ(ierr);
      ierr = PetscOptionsEnd();CHKERRQ(ierr);
    }
    /* pretend the operands are not CUSPARSE so that the dispatch below takes the AIJ fallback */
    if (usecpu) Biscusp = Ciscusp = PETSC_FALSE;
  }

  /* dispatch */
  if (isdense) {
    switch (product->type) {
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (!product->A->boundtocpu) {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      } else {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
  } else if (Biscusp && Ciscusp) {
    switch (product->type) {
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
      break;
    default:
      break;
    }
  } else { /* fallback for AIJ */
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3008ccdfe979SStefano Zampini 
/* MatMult_SeqAIJCUSPARSE - yy = A xx: the shared multiply-add kernel is called without a vector to add and without transposition */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3017e6e9a74fSStefano Zampini 
/* MatMultAdd_SeqAIJCUSPARSE - zz = A xx + yy: forwards to the shared multiply-add kernel without transposition */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3026e6e9a74fSStefano Zampini 
/* MatMultHermitianTranspose_SeqAIJCUSPARSE - yy = A^H xx: the shared multiply-add kernel is called with both the transpose and the Hermitian flag set and no vector to add */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3035e6e9a74fSStefano Zampini 
/* MatMultHermitianTransposeAdd_SeqAIJCUSPARSE - zz = A^H xx + yy: forwards to the shared multiply-add kernel with both the transpose and the Hermitian flag set */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
30449ae82921SPaul Mullowney 
/* MatMultTranspose_SeqAIJCUSPARSE - yy = A^T xx: the shared multiply-add kernel is called with the transpose flag only and no vector to add */
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3053ca45077fSPaul Mullowney 
/*
  ScatterAdd - CUDA kernel performing y[idx[i]] += x[i] for every 0 <= i < n.

  Launch layout: 1D grid of 1D blocks, one thread per entry; the grid must provide at least n
  threads. Surplus threads (global index >= n) do nothing. No shared memory is used.

  Input Parameters:
+ n   - number of entries to scatter
. idx - device array of length n with the destination index in y of each entry
. x   - device array of length n with the values to add
- y   - device array updated in place

  Notes:
  The update is a plain (non-atomic) read-modify-write, so the entries of idx must be distinct.

  The global thread index is computed in 64-bit arithmetic: blockIdx.x*blockDim.x is an unsigned
  32-bit product, and storing it in an int would wrap (possibly to a negative value that passes
  the i < n test) when n is close to or above 2^31, e.g. with 64-bit PetscInt.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  const long long i = (long long)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < (long long)n) y[idx[i]] += x[i];
}
3059a0e72f99SJunchao Zhang 
/*
  MatMultAddKernel_SeqAIJCUSPARSE - common driver for all SeqAIJCUSPARSE matrix-vector products.

  Computes z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op

  Input Parameters:
+ A     - the matrix; its GPU copy is refreshed before the product
. xx    - the vector to multiply
. yy    - the vector to add, or NULL for a plain product; may be the same object as zz
. trans - PETSC_TRUE to apply the transpose of A
- herm  - PETSC_TRUE to apply the conjugate transpose; requires trans to be PETSC_TRUE

  Output Parameter:
. zz - the result

  Notes:
  When A is stored with compressed rows (zero rows dropped) the cuSPARSE operand is shorter than the
  full vector, so cusparsestruct->workVector is used as the short operand: as the output of A x in
  the non-transpose case (scattered back into zz afterwards), and as the gathered input in the
  transpose case.

  Errors are raised if herm is set without trans, if the required GPU matrix structure is missing,
  or if a CUDA/cuSPARSE call fails.
*/
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
{
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct;
  PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
  PetscBool                    compressed;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  PetscInt                     nx = 0,ny = 0; /* only meaningful (and only used) for the CSR format */
#endif

  PetscFunctionBegin;
  if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"Hermitian and not transpose not supported");
  if (!a->nonzerorowcnt) { /* A has no nonzero rows: op(A) x = 0, so z = 0 or z = y */
    if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
    else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
    PetscFunctionReturn(0);
  }
  /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!trans) {
    matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
  } else {
    if (herm || !A->form_explicit_transpose) {
      /* let cuSPARSE apply the (conjugate) transpose on the fly to the untransposed matrix */
      opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
    } else {
      /* multiply with the explicitly stored transpose; opA stays NON_TRANSPOSE */
      if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);}
      matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
    }
  }
  /* matstruct is dereferenced right below on every path, so guard the transpose paths as well */
  if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_GPU,"SeqAIJCUSPARSE does not have the GPU matrix structure required for this product");
  /* Does the matrix use compressed rows (i.e., drop zero rows)? */
  compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;

  try {
    ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
    else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      /* z = A x + beta y.
         If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
         When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
      */
      xptr = xarray;
      dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
      beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
          allocated to accommodate different uses. So we get the length info directly from mat.
       */
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_cols;
        ny = mat->num_rows;
      }
     #endif
    } else {
      /* z = A^T x + beta y
         If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
         Note A^Tx is of full length, so we set beta to 1.0 if y exists.
       */
      xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
      dptr = zarray;
      beta = yy ? matstruct->beta_one : matstruct->beta_zero;
      if (compressed) { /* Scatter x to work vector */
        thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
        thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAEqualsReverse());
      }
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
        CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
        nx = mat->num_rows;
        ny = mat->num_cols;
      }
     #endif
    }

    /* csr_spmv does y = alpha op(A) x + beta y */
    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      /* opA is used as an index into matstruct->cuSpMV[], so make sure the enum values are still 0,1,2 */
      if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
      if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
        cudaError_t cerr;
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
        stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
                                matstruct->matDescr,
                                matstruct->cuSpMV[opA].vecXDescr, beta,
                                matstruct->cuSpMV[opA].vecYDescr,
                                cusparse_scalartype,
                                cusparsestruct->spmvAlg,
                                &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);

        matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
      } else {
        /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
        stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
      }

      stat = cusparseSpMV(cusparsestruct->handle, opA,
                               matstruct->alpha_one,
                               matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTranspose() */
                               matstruct->cuSpMV[opA].vecXDescr,
                               beta,
                               matstruct->cuSpMV[opA].vecYDescr,
                               cusparse_scalartype,
                               cusparsestruct->spmvAlg,
                               matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
     #else
      CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
      stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
                               mat->num_rows, mat->num_cols,
                               mat->num_entries, matstruct->alpha_one, matstruct->descr,
                               mat->values->data().get(), mat->row_offsets->data().get(),
                               mat->column_indices->data().get(), xptr, beta,
                               dptr);CHKERRCUSPARSE(stat);
     #endif
    } else {
      if (cusparsestruct->nrows) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
        stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
                                 matstruct->alpha_one, matstruct->descr, hybMat,
                                 xptr, beta,
                                 dptr);CHKERRCUSPARSE(stat);
       #endif
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
      if (yy) { /* MatMultAdd: zz = A*xx + yy */
        if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
          ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
        } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
          ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
        }
      } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
        ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
      }

      /* ScatterAdd the result from work vector into the full vector when A is compressed */
      if (compressed) {
        ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
        /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registered)
           and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
           prevent that. So I just add a ScatterAdd kernel.
         */
       #if 0
        thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
        thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
                         thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
                         VecCUDAPlusEquals());
       #else
        PetscInt n = matstruct->cprowIndices->size();
        if (n) { /* a grid with zero blocks is an invalid launch configuration */
          cudaError_t cerr;
          ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
          cerr = cudaGetLastError();CHKERRCUDA(cerr); /* kernel launches do not return a status: fetch launch errors explicitly */
        }
       #endif
        ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
      }
    } else {
      if (yy && yy != zz) {
        ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
      }
    }
    ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
    if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
    else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  /* 2 flops per stored nonzero; without the added vector, one addition per nonzero row is saved */
  if (yy) {
    ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
  } else {
    ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
32519ae82921SPaul Mullowney 
/* MatMultTransposeAdd_SeqAIJCUSPARSE - zz = A^T xx + yy: forwards to the shared multiply-add kernel with the transpose flag only */
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3260ca45077fSPaul Mullowney 
/*
  MatAssemblyEnd_SeqAIJCUSPARSE - finishes assembly through the SeqAIJ routine and, if that changed
  the nonzero state of A, frees the device-side matrix (cusp->deviceMat), if one is held, and resets
  the pointer to NULL.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_SeqAIJCUSPARSE     *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  const PetscObjectState stateBefore = A->nonzerostate; /* sampled before the SeqAIJ assembly can bump it */
  PetscErrorCode         ierr;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
  /* nothing to release when the nonzero state is unchanged or no device matrix is held */
  if (stateBefore == A->nonzerostate || !cusp->deviceMat) PetscFunctionReturn(0);

  ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
  cerr = cudaFree(cusp->deviceMat);CHKERRCUDA(cerr);
  cusp->deviceMat = NULL;
  PetscFunctionReturn(0);
}
32789ae82921SPaul Mullowney 
32799ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
3280e057df02SPaul Mullowney /*@
32819ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
3282e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately be pushed down
3283e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3284e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
3285e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
3286e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
32879ae82921SPaul Mullowney 
3288d083f849SBarry Smith    Collective
32899ae82921SPaul Mullowney 
32909ae82921SPaul Mullowney    Input Parameters:
32919ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
32929ae82921SPaul Mullowney .  m - number of rows
32939ae82921SPaul Mullowney .  n - number of columns
32949ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
32959ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
32960298fd71SBarry Smith          (possibly different for each row) or NULL
32979ae82921SPaul Mullowney 
32989ae82921SPaul Mullowney    Output Parameter:
32999ae82921SPaul Mullowney .  A - the matrix
33009ae82921SPaul Mullowney 
33019ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
33029ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
33039ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
33049ae82921SPaul Mullowney 
33059ae82921SPaul Mullowney    Notes:
33069ae82921SPaul Mullowney    If nnz is given then nz is ignored
33079ae82921SPaul Mullowney 
33089ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
33099ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
33109ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
33119ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
33129ae82921SPaul Mullowney 
33139ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
33140298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
33159ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
33169ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
33179ae82921SPaul Mullowney 
33189ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
33199ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
33209ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
33219ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
33229ae82921SPaul Mullowney 
33239ae82921SPaul Mullowney    Level: intermediate
33249ae82921SPaul Mullowney 
3325e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
33269ae82921SPaul Mullowney @*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* create the object; local and global sizes are both set to (m,n) */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);

  /* select the CUSPARSE implementation, then preallocate the host AIJ storage */
  ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
33389ae82921SPaul Mullowney 
/*
   MatDestroy_SeqAIJCUSPARSE - Destroys the GPU-side data attached to A->spptr, removes the
   functions composed on the object, and then hands the matrix to MatDestroy_SeqAIJ().
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* composed function names to clear, in the order they are removed */
  static const char *const composed[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatCUSPARSESetUseCPUSolve_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C",
    "MatConvert_seqaijcusparse_hypre_C"
  };
  PetscErrorCode ierr;
  size_t         k;

  PetscFunctionBegin;
  /* spptr holds a different struct depending on whether the matrix is a factor */
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }
  for (k = 0; k < sizeof(composed)/sizeof(composed[0]); k++) {
    ierr = PetscObjectComposeFunction((PetscObject)A,composed[k],NULL);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
33629ae82921SPaul Mullowney 
3363ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
336495639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/*
   MatDuplicate_SeqAIJCUSPARSE - Duplicates A with MatDuplicate_SeqAIJ() and then converts the
   duplicate in place to MATSEQAIJCUSPARSE.
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* step 1: host AIJ duplicate */
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  /* step 2: in-place conversion of the duplicate to the CUSPARSE type */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
33749ff858a8SKarl Rupp 
/*
   MatAXPY_SeqAIJCUSPARSE - Computes Y = a*X + Y, on the GPU when possible.

   Input Parameters:
+  Y   - matrix that is updated
.  a   - scalar multiplier applied to X
.  X   - matrix that is added
-  str - relation between the nonzero patterns of X and Y

   Notes:
   If X and Y do not share the same axpy implementation the host routine MatAXPY_SeqAIJ() is used.
   Otherwise both matrices are copied to the GPU and must be in MAT_CUSPARSE_CSR format (an error
   is raised if not). When str is not SAME_NONZERO_PATTERN but the nonzero counts match and neither
   matrix uses compressed rows, the device row offsets and column indices are compared and str is
   upgraded to SAME_NONZERO_PATTERN if they are equal.
   SUBSET_NONZERO_PATTERN uses cusparse spgeam writing into Y's existing arrays,
   SAME_NONZERO_PATTERN uses a cublas axpy on the value arrays, and anything else falls back to
   MatAXPY_SeqAIJ() after invalidating Y's cached transpose.
*/
3375039c6fbaSStefano Zampini static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
337695639643SRichard Tran Mills {
3377e6e9a74fSStefano Zampini   PetscErrorCode     ierr;
3378a587d139SMark   Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
3379039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cy;
3380039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cx;
3381039c6fbaSStefano Zampini   PetscScalar        *ay;
3382039c6fbaSStefano Zampini   const PetscScalar  *ax;
3383039c6fbaSStefano Zampini   CsrMatrix          *csry,*csrx;
3384e6e9a74fSStefano Zampini 
338595639643SRichard Tran Mills   PetscFunctionBegin;
3386a49f1ed0SStefano Zampini   cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
3387a49f1ed0SStefano Zampini   cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  /* different axpy implementations on X and Y: do the update with the host routine */
3388039c6fbaSStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
3389a49f1ed0SStefano Zampini     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
3390a587d139SMark     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
3391a587d139SMark     PetscFunctionReturn(0);
339295639643SRichard Tran Mills   }
3393039c6fbaSStefano Zampini   /* if we are here, it means both matrices are bound to GPU */
3394a587d139SMark   ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
3395a587d139SMark   ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
3396e8d2b73aSMark Adams   if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
3397e8d2b73aSMark Adams   if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_GPU,"only MAT_CUSPARSE_CSR supported");
3398039c6fbaSStefano Zampini   csry = (CsrMatrix*)cy->mat->mat;
3399039c6fbaSStefano Zampini   csrx = (CsrMatrix*)cx->mat->mat;
3400039c6fbaSStefano Zampini   /* see if we can turn this into a cublas axpy */
3401039c6fbaSStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3402039c6fbaSStefano Zampini     bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
3403039c6fbaSStefano Zampini     if (eq) {
3404039c6fbaSStefano Zampini       eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
3405039c6fbaSStefano Zampini     }
3406039c6fbaSStefano Zampini     if (eq) str = SAME_NONZERO_PATTERN;
3407039c6fbaSStefano Zampini   }
3408d2be01edSStefano Zampini   /* spgeam is buggy with one column */
3409d2be01edSStefano Zampini   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3410039c6fbaSStefano Zampini 
3411039c6fbaSStefano Zampini   if (str == SUBSET_NONZERO_PATTERN) {
3412039c6fbaSStefano Zampini     cusparseStatus_t stat;
3413039c6fbaSStefano Zampini     PetscScalar      b = 1.0;
3414039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3415039c6fbaSStefano Zampini     size_t           bufferSize;
3416039c6fbaSStefano Zampini     void             *buffer;
3417ee7b52eaSHong Zhang     cudaError_t      cerr;
3418039c6fbaSStefano Zampini #endif
3419039c6fbaSStefano Zampini 
3420039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
3421039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* the scalars a and b are passed by host address (&a, &b), so use host pointer mode for the spgeam calls */
3422039c6fbaSStefano Zampini     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
3423039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* operands are a*X and b*Y (b = 1); the output operand reuses Y's descriptor, values, row offsets and column indices */
3424039c6fbaSStefano Zampini     stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
3425039c6fbaSStefano Zampini                                           &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3426039c6fbaSStefano Zampini                                           &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3427039c6fbaSStefano Zampini                                              cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    /* temporary work buffer, allocated and released on every call */
3428039c6fbaSStefano Zampini     cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
3429039c6fbaSStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3430039c6fbaSStefano Zampini     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3431039c6fbaSStefano Zampini                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3432039c6fbaSStefano Zampini                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3433039c6fbaSStefano Zampini                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
3434039c6fbaSStefano Zampini     ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
3435039c6fbaSStefano Zampini     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3436039c6fbaSStefano Zampini     cerr = cudaFree(buffer);CHKERRCUDA(cerr);
3437039c6fbaSStefano Zampini #else
    /* pre-CUDA-11 variant: same operands, no external work buffer argument */
3438039c6fbaSStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3439039c6fbaSStefano Zampini     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3440039c6fbaSStefano Zampini                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3441039c6fbaSStefano Zampini                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3442039c6fbaSStefano Zampini                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
3443039c6fbaSStefano Zampini     ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
3444039c6fbaSStefano Zampini     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3445039c6fbaSStefano Zampini #endif
    /* switch the handle back to device pointer mode */
3446039c6fbaSStefano Zampini     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
3447039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
3448039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
3449039c6fbaSStefano Zampini     ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
3450039c6fbaSStefano Zampini   } else if (str == SAME_NONZERO_PATTERN) {
3451a587d139SMark     cublasHandle_t cublasv2handle;
3452039c6fbaSStefano Zampini     cublasStatus_t berr;
3453a587d139SMark     PetscBLASInt   one = 1, bnz = 1;
3454039c6fbaSStefano Zampini 
3455039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
3456039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
3457a587d139SMark     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
3458a587d139SMark     ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
3459a587d139SMark     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* identical patterns: ay += a*ax over the x->nz stored values, unit stride */
3460039c6fbaSStefano Zampini     berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
3461a587d139SMark     ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
3462a587d139SMark     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3463039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
3464039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
3465a587d139SMark     ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
3466039c6fbaSStefano Zampini   } else {
    /* any other pattern relation: host fallback */
3467a49f1ed0SStefano Zampini     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
3468d2be01edSStefano Zampini     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
3469a587d139SMark   }
347095639643SRichard Tran Mills   PetscFunctionReturn(0);
347195639643SRichard Tran Mills }
347295639643SRichard Tran Mills 
/*
   MatScale_SeqAIJCUSPARSE - Scales the stored values of Y by a using a cublas scal call.

   Input Parameters:
+  Y - the matrix, scaled in place
-  a - the scaling factor
*/
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *vals;
  PetscBLASInt   stride = 1, nvals = 1;
  cublasHandle_t handle;
  cublasStatus_t cberr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&vals);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  /* number of stored values, converted to the BLAS integer type */
  ierr = PetscBLASIntCast(aij->nz,&nvals);CHKERRQ(ierr);

  ierr  = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  cberr = cublasXscal(handle,nvals,&a,vals,stride);CHKERRCUBLAS(cberr);
  ierr  = PetscLogGpuFlops(nvals);CHKERRQ(ierr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&vals);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
349433c9ba73SStefano Zampini 
/*
   MatZeroEntries_SeqAIJCUSPARSE - Zeros the stored values of A on the host and, where device
   value arrays exist, on the device too.

   Notes:
   For unfactored matrices the device values of the matrix and of its cached transpose are
   filled with zero if present. The host values are always zeroed. The offload mask becomes
   PETSC_OFFLOAD_BOTH when the device copy of the matrix values was zeroed and
   PETSC_OFFLOAD_CPU otherwise.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscBool      zeroedOnDevice = PETSC_FALSE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

      if (csr->values) {
        zeroedOnDevice = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)cusp->matTranspose->mat;

      if (csrT->values) {
        thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
      }
    }
  }
  /* zero the host values directly; a->i[nrows] is the number of stored entries */
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = zeroedOnDevice ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
35253fa6b06aSMark Adams 
/*
   MatBindToCPU_SeqAIJCUSPARSE - Switches the matrix between the host (SeqAIJ) and the
   CUSPARSE implementations of its operations.

   Input Parameters:
+  A   - the matrix
-  flg - PETSC_TRUE to bind to the CPU, PETSC_FALSE to use the GPU implementations

   Notes:
   For factored matrices only A->boundtocpu is recorded.
   When binding to the CPU the values are first copied back from the GPU, the operation table is
   pointed at the SeqAIJ routines, the Mat_SeqAIJ ops table is zeroed, and the CUSPARSE-specific
   composed functions are removed. When unbinding, the CUSPARSE routines and composed functions
   are installed. Inodes are enabled only when bound to the CPU and a->inode.size is nonzero.
*/
3526a587d139SMark static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
3527a587d139SMark {
3528a587d139SMark   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3529a587d139SMark   PetscErrorCode ierr;
3530a587d139SMark 
3531a587d139SMark   PetscFunctionBegin;
  /* factored matrices: just remember the flag, no operation switching */
35329a14fc28SStefano Zampini   if (A->factortype != MAT_FACTOR_NONE) {
35339a14fc28SStefano Zampini     A->boundtocpu = flg;
35349a14fc28SStefano Zampini     PetscFunctionReturn(0);
35359a14fc28SStefano Zampini   }
3536a587d139SMark   if (flg) {
    /* bring the values back to the host before the host routines take over */
3537a587d139SMark     ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
3538a587d139SMark 
353933c9ba73SStefano Zampini     A->ops->scale                     = MatScale_SeqAIJ;
3540a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJ;
3541a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3542a587d139SMark     A->ops->mult                      = MatMult_SeqAIJ;
3543a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3544a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3545a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3546a587d139SMark     A->ops->multhermitiantranspose    = NULL;
3547a587d139SMark     A->ops->multhermitiantransposeadd = NULL;
3548fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* clear all Mat_SeqAIJ array-access hooks (the GPU variants are set in the else branch) */
354967a45760SJunchao Zhang     ierr = PetscMemzero(a->ops,sizeof(Mat_SeqAIJOps));CHKERRQ(ierr);
3550c215019aSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
3551a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
3552a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
3553a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
3554a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
3555a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
3556fcdce8c4SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
3557a587d139SMark   } else {
355833c9ba73SStefano Zampini     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
3559a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
3560a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
3561a587d139SMark     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
3562a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
3563a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
3564a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
3565a587d139SMark     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3566a587d139SMark     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3567fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
356867a45760SJunchao Zhang     a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJCUSPARSE;
356967a45760SJunchao Zhang     a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJCUSPARSE;
357067a45760SJunchao Zhang     a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE;
357167a45760SJunchao Zhang     a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE;
357267a45760SJunchao Zhang     a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE;
357367a45760SJunchao Zhang     a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE;
3574c215019aSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
3575a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3576a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3577a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
3578a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
3579fcdce8c4SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3580a587d139SMark   }
3581a587d139SMark   A->boundtocpu = flg;
  /* inodes are used only on the CPU path, and only if inode data exists */
3582ea500dcfSRichard Tran Mills   if (flg && a->inode.size) {
3583ea500dcfSRichard Tran Mills     a->inode.use = PETSC_TRUE;
3584ea500dcfSRichard Tran Mills   } else {
3585ea500dcfSRichard Tran Mills     a->inode.use = PETSC_FALSE;
3586ea500dcfSRichard Tran Mills   }
3587a587d139SMark   PetscFunctionReturn(0);
3588a587d139SMark }
3589a587d139SMark 
/*
   MatConvert_SeqAIJ_SeqAIJCUSPARSE - Converts a SeqAIJ matrix to MATSEQAIJCUSPARSE.

   Input Parameters:
+  A     - the source matrix
.  mtype - requested type (not read by this routine)
-  reuse - MAT_INITIAL_MATRIX (duplicate A into *newmat), MAT_REUSE_MATRIX (copy A into the
           existing *newmat), or any other value, in which case *newmat is converted as is

   Output Parameter:
.  newmat - the converted matrix

   Notes:
   Sets the default vector type to VECCUDA. Unless reuse is MAT_REUSE_MATRIX or spptr already
   exists, a Mat_SeqAIJCUSPARSE (unfactored) or Mat_SeqAIJCUSPARSETriFactors (factored) struct
   with its own cusparse handle is allocated and the offload mask is set to
   PETSC_OFFLOAD_UNALLOCATED. The CUSPARSE operations are then installed through
   MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE).
*/
359049735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
35919ae82921SPaul Mullowney {
35929ae82921SPaul Mullowney   PetscErrorCode   ierr;
3593aa372e3fSPaul Mullowney   cusparseStatus_t stat;
359449735bf3SStefano Zampini   Mat              B;
35959ae82921SPaul Mullowney 
35969ae82921SPaul Mullowney   PetscFunctionBegin;
3597a4af0ceeSJacob Faibussowitsch   ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
359849735bf3SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
359949735bf3SStefano Zampini     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
360049735bf3SStefano Zampini   } else if (reuse == MAT_REUSE_MATRIX) {
360149735bf3SStefano Zampini     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
360249735bf3SStefano Zampini   }
  /* from here on all work is done on the target matrix */
360349735bf3SStefano Zampini   B = *newmat;
360449735bf3SStefano Zampini 
360534136279SStefano Zampini   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
360634136279SStefano Zampini   ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);
360734136279SStefano Zampini 
  /* allocate the GPU-side struct only once; its type depends on whether B is a factor */
360849735bf3SStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
36099ae82921SPaul Mullowney     if (B->factortype == MAT_FACTOR_NONE) {
3610e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSE *spptr;
3611e6e9a74fSStefano Zampini       ierr = PetscNew(&spptr);CHKERRQ(ierr);
3612e6e9a74fSStefano Zampini       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3613a0e72f99SJunchao Zhang       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
36141a2c6b5cSJunchao Zhang       spptr->format     = MAT_CUSPARSE_CSR;
3615d8132acaSStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3616a435da06SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,4,0)
3617a435da06SStefano Zampini       spptr->spmvAlg    = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */
3618a435da06SStefano Zampini      #else
3619d8132acaSStefano Zampini       spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
3620a435da06SStefano Zampini      #endif
3621d8132acaSStefano Zampini       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3622d8132acaSStefano Zampini       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3623d8132acaSStefano Zampini      #endif
36241a2c6b5cSJunchao Zhang       B->spptr = spptr;
36259ae82921SPaul Mullowney     } else {
3626e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSETriFactors *spptr;
3627e6e9a74fSStefano Zampini 
3628e6e9a74fSStefano Zampini       ierr = PetscNew(&spptr);CHKERRQ(ierr);
3629e6e9a74fSStefano Zampini       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3630a0e72f99SJunchao Zhang       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3631e6e9a74fSStefano Zampini       B->spptr = spptr;
36329ae82921SPaul Mullowney     }
3633e6e9a74fSStefano Zampini     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
363449735bf3SStefano Zampini   }
  /* operations that stay the same whether or not the matrix is bound to the CPU */
3635693b0035SStefano Zampini   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
36369ae82921SPaul Mullowney   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
36371a2c6b5cSJunchao Zhang   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
36389ae82921SPaul Mullowney   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
363995639643SRichard Tran Mills   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
3640693b0035SStefano Zampini   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
36412205254eSKarl Rupp 
  /* install the GPU implementations of the remaining operations */
3642e6e9a74fSStefano Zampini   ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
36439ae82921SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3644bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
3645ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE)
3646ae48a8d0SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)B,"MatConvert_seqaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
3647ae48a8d0SStefano Zampini #endif
3648365b711fSMark Adams   ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetUseCPUSolve_C",MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE);CHKERRQ(ierr);
36499ae82921SPaul Mullowney   PetscFunctionReturn(0);
36509ae82921SPaul Mullowney }
36519ae82921SPaul Mullowney 
/*
   MatCreate_SeqAIJCUSPARSE - Type constructor: builds B as a SeqAIJ matrix and then converts
   it in place to MATSEQAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host AIJ construction first */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  /* then attach the CUSPARSE data and operations to the same object */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
366102fe1965SBarry Smith 
36623ca39a21SBarry Smith /*MC
3663e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3664e057df02SPaul Mullowney 
3665e057df02SPaul Mullowney    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
36662692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
36672692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3668e057df02SPaul Mullowney 
3669e057df02SPaul Mullowney    Options Database Keys:
3670e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3671aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3672a2b725a8SWilliam Gropp .  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3673365b711fSMark Adams -  -mat_cusparse_use_cpu_solve - Do MatSolve on CPU
3674e057df02SPaul Mullowney 
3675e057df02SPaul Mullowney   Level: beginner
3676e057df02SPaul Mullowney 
36778468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3678e057df02SPaul Mullowney M*/
36797f756511SDominic Meiser 
3680bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
36810f39cd5aSBarry Smith 
/*
   MatSolverTypeRegister_CUSPARSE - Registers the CUSPARSE-based factorization solvers:
   the band LU solver for MATSEQAIJ, and LU, Cholesky, ILU and ICC for MATSEQAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  /* factorizations provided by MATSOLVERCUSPARSE, in registration order */
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  PetscErrorCode      ierr;
  size_t              k;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND,MATSEQAIJ,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  for (k = 0; k < sizeof(ftypes)/sizeof(ftypes[0]); k++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[k],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
369529b38603SBarry Smith 
/*
   MatSeqAIJCUSPARSE_Destroy - Releases the resources held by a Mat_SeqAIJCUSPARSE and frees
   the struct itself.

   Input Parameter:
.  cusparsestruct - address of the struct pointer; nothing is done when it points to NULL

   Notes:
   Destroys the mult structs of the matrix and its cached transpose, deletes the work vector,
   row offsets, COO permutation and csr2csc index arrays, frees the device matrix handle,
   destroys the cusparse handle, and finally frees the struct with PetscFree().
   The deviceMat handle is released with cudaFree(), the same call MatAssemblyEnd_SeqAIJCUSPARSE()
   uses for it, so that it does not outlive the struct that points to it.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  PetscErrorCode   ierr;
  cusparseStatus_t stat;
  cudaError_t      cerr;

  PetscFunctionBegin;
  if (*cusparsestruct) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
    delete (*cusparsestruct)->workVector;
    delete (*cusparsestruct)->rowoffsets_gpu;
    delete (*cusparsestruct)->cooPerm;
    delete (*cusparsestruct)->cooPerm_a;
    delete (*cusparsestruct)->csr2csc_i;
    /* release the device matrix handle before the owning struct goes away */
    if ((*cusparsestruct)->deviceMat) {
      cerr = cudaFree((*cusparsestruct)->deviceMat);CHKERRCUDA(cerr);
      (*cusparsestruct)->deviceMat = NULL;
    }
    if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
    ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
37157f756511SDominic Meiser 
/*
  CsrMatrix_Destroy - deletes a CsrMatrix together with its values, column_indices
  and row_offsets arrays, then sets *mat to 0. A NULL *mat is left untouched.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (csr) {
    delete csr->values;
    delete csr->column_indices;
    delete csr->row_offsets;
    delete csr;
    *mat = 0;
  }
  PetscFunctionReturn(0);
}
37287f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - overload for triangular-factor structures.

  Input Parameter:
. trifactor - address of the structure pointer; nothing is done when *trifactor is NULL

  Destroys the matrix descriptor and the solve-analysis info, the CSR matrix, the
  solve buffer (cudaFree), the AA_h buffer (cudaFreeHost) and, for CUDA >= 11, the
  csr2csc buffer; the structure itself is then released with PetscFree().
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!tf) PetscFunctionReturn(0);

  if (tf->descr) {
    stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSPARSE(stat);
  }
  if (tf->solveInfo) {
    stat = cusparse_destroy_analysis_info(tf->solveInfo);CHKERRCUSPARSE(stat);
  }
  ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
  if (tf->solveBuffer) {
    cerr = cudaFree(tf->solveBuffer);CHKERRCUDA(cerr);
  }
  if (tf->AA_h) {
    cerr = cudaFreeHost(tf->AA_h);CHKERRCUDA(cerr);
  }
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (tf->csr2cscBuffer) {
    cerr = cudaFree(tf->csr2cscBuffer);CHKERRCUDA(cerr);
  }
 #endif
  ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
37487f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy - overload for mat-vec structures; frees the
  structure and sets *matstruct to NULL.

  Input Parameters:
+ matstruct - address of the structure pointer; nothing is done when *matstruct is NULL
- format    - storage format, which selects how (*matstruct)->mat is released

  Notes:
  For MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB the mat member is a cusparseHybMat_t; with
  CUDA >= 11 these formats raise PETSC_ERR_SUP. Otherwise the mat member is a CsrMatrix.
  An error from any release step is propagated to the caller.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* check the return code like every other destroy call in this file */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* the scalar constants are released with cudaFree() */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* release the buffer and vector descriptors of each initialized cuSpMV slot */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
37927f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSETriFactors_Reset - releases the factorization data held by a
  tri-factors structure while keeping the structure itself.

  Input Parameter:
. trifactors - address of the structure pointer; nothing is done when *trifactors is NULL

  Notes:
  Every released member is set back to NULL so that the routine can be called again
  (for instance from MatSeqAIJCUSPARSETriFactors_Destroy()) without freeing anything twice.
  The handle member is not touched here.
*/
PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p* trifactors)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d) {
      cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);
      (*trifactors)->a_band_d = NULL; /* avoid a second cudaFree() on the next reset */
    }
    if ((*trifactors)->i_band_d) {
      cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);
      (*trifactors)->i_band_d = NULL; /* avoid a second cudaFree() on the next reset */
    }
    (*trifactors)->init_dev_prop = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3815ccdfe979SStefano Zampini 
/*
  MatSeqAIJCUSPARSETriFactors_Destroy - resets the tri-factors structure, destroys its
  cuSPARSE handle (if any) and frees the structure with PetscFree().
  Nothing is done when *trifactors is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode   ierr;
  cusparseHandle_t handle;
  cusparseStatus_t stat;

  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(0);

  ierr   = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
  handle = (*trifactors)->handle;
  if (handle) {
    stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
  }
  ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
38327e8381f9SStefano Zampini 
/* Strict ordering of (i,j) pairs: by first component, ties broken by second component */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt i1 = thrust::get<0>(t1), i2 = thrust::get<0>(t2);

    if (i1 != i2) return i1 < i2;
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
38437e8381f9SStefano Zampini 
/* Equality of (i,j) pairs: true only when both components match */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return thrust::get<0>(t1) == thrust::get<0>(t2) && thrust::get<1>(t1) == thrust::get<1>(t2);
  }
};
38537e8381f9SStefano Zampini 
/* Change indicator: 0 when the two indices are equal, 1 otherwise */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    if (t1 == t2) return 0;
    return 1;
  }
};
38627e8381f9SStefano Zampini 
/* Logical OR of two flags, returned as 0 or 1 */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
38717e8381f9SStefano Zampini 
38727e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - sets the matrix values on the device from a COO value array,
  using the permutation stored in cusp->cooPerm.

  Input Parameters:
+ A     - the matrix
. v     - the values, in host or device memory (checked with isCudaMem()); may be NULL
- imode - ADD_VALUES adds to the stored values, any other mode overwrites them

  Notes:
  When cusp->cooPerm is missing the matrix is just assembled.
  When v is NULL and imode is INSERT_VALUES the device values are zeroed.
  When cusp->cooPerm_a is present, values sharing the same key in cooPerm_a are summed
  before being stored or added.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ                            *aij = (Mat_SeqAIJ*)A->data;
  THRUSTARRAY                           *staged = NULL; /* device copy of v[], only used when v[] is on the host */
  thrust::device_ptr<const PetscScalar> d_v;
  CsrMatrix                             *csr;
  PetscErrorCode                        ierr;
  PetscInt                              n;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");

  if (!v) {
    /* no values given: inserting means zeroing, adding is a no-op */
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
  } else {
    n = cusp->cooPerm->size();
    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      staged = new THRUSTARRAY(n);
      staged->assign(v,v+n);
      d_v = staged->data();
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
    }
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (cusp->cooPerm_a) {
      /* There are repeated entries in d_v[]: reduce them first.
         thrust::reduce_by_key(keys_first,keys_last,values_first,keys_output,values_output)
         cooPerm_a = [0,0,1,2,3,4]. The length is n, the number of nonzeros in d_v[].
         cooPerm_a is ordered. d_v[i] is the cooPerm_a[i]-th unique nonzero.
      */
      auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
      if (imode == ADD_VALUES) { /* reduce into a temporary, then add it to the existing values */
        THRUSTARRAY *summed = new THRUSTARRAY(csr->values->size());
        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),summed->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
        thrust::transform(summed->begin(),summed->end(),csr->values->begin(),csr->values->begin(),thrust::plus<PetscScalar>());
        delete summed;
      } else { /* reduce directly into the matrix values */
        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),csr->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      }
    } else {
      /* all nonzeros in d_v[] are unique entries: pair d_v[cooPerm[i]] with values[i] */
      auto zfirst = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                 csr->values->begin()));
      auto zlast  = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                 csr->values->end()));
      if (imode == ADD_VALUES) {
        thrust::for_each(zfirst,zlast,VecCUDAPlusEquals()); /* values[i] += d_v[cooPerm[i]] */
      } else {
        thrust::for_each(zfirst,zlast,VecCUDAEquals());
      }
    }
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  }

  delete staged;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",A->rmap->n,A->cmap->n,aij->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",aij->rmax);CHKERRQ(ierr);
  aij->reallocs       = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->was_assembled    = PETSC_TRUE;
  A->assembled        = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
39547e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSEInvalidateTranspose - marks the transpose of A as out of date.

  Input Parameters:
+ A       - the MATSEQAIJCUSPARSE matrix
- destroy - when true, the transpose mult structure and the csr2csc_i array are also freed

  Returns without doing anything when A->spptr is NULL.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3971a49f1ed0SStefano Zampini 
39727e8381f9SStefano Zampini #include <thrust/binary_search.h>
/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - builds the CSR structure of A from n COO index pairs.

  Input Parameters:
+ A     - the matrix
. n     - number of COO entries
. coo_i - row indices of the entries
- coo_j - column indices of the entries

  Notes:
  The pairs are sorted and made unique on the device. The sorting permutation is kept in
  cusp->cooPerm; when duplicates exist cusp->cooPerm_a maps every sorted entry to the index
  of its unique nonzero, otherwise cusp->cooPerm_a is NULL. The host CSR arrays of A are
  rebuilt from the result, zero values are copied to the GPU, and A is left unassembled.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           oldn, nzrows = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  oldn = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != oldn) { /* the stored permutation has a different length: drop it */
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY drow(n);
    THRUSTINTARRAY dcol(n);
    THRUSTINTARRAY rowend(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    drow.assign(coo_i,coo_i+n);
    dcol.assign(coo_j,coo_j+n);

    /* Ex.
      n = 6
      coo_i = [3,3,1,4,1,4]
      coo_j = [3,2,2,5,2,6]
    */
    auto kbeg = thrust::make_zip_iterator(thrust::make_tuple(drow.begin(),dcol.begin()));
    auto kend = thrust::make_zip_iterator(thrust::make_tuple(drow.end(),dcol.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(kbeg, kend, cusp->cooPerm->begin(), IJCompare()); /* sort by row, then by col */
    *cusp->cooPerm_a = drow; /* copy the sorted row indices */
    THRUSTINTARRAY colchg(dcol); /* copy of the sorted column indices */

    /*
      drow    = [1,1,3,3,4,4]
      dcol    = [2,2,2,3,5,6]
      cooPerm = [2,4,1,0,3,5]
    */
    auto kuniq = thrust::unique(kbeg, kend, IJEqual()); /* unique (drow, dcol) */

    /*
      drow    = [1,3,3,4,4,x]
                            ^kend
      dcol    = [2,2,3,5,6,x]
                           ^kuniq
    */
    if (kuniq == kend) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* Stefano: I couldn't come up with a more elegant algorithm */
      /* idea: any change in i or j in the (i,j) sequence implies a new nonzero */
      adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff()); /* cooPerm_a: [1,1,3,3,4,4] => [1,0,1,0,1,0]*/
      adjacent_difference(colchg.begin(),colchg.end(),colchg.begin(),IJDiff());                               /* colchg:    [2,2,2,3,5,6] => [2,0,0,1,1,1]*/
      (*cusp->cooPerm_a)[0] = 0; /* clear the first entry, though accessing an entry on device implies a cudaMemcpy */
      colchg[0] = 0;
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),colchg.begin(),cusp->cooPerm_a->begin(),IJSum()); /* cooPerm_a =          [0,0,1,1,1,1]*/
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>()); /*cooPerm_a=[0,0,1,2,3,4]*/
    }
    thrust::counting_iterator<PetscInt> rowids(0);
    thrust::upper_bound(drow.begin(), kuniq.get_iterator_tuple().get<0>(), /* binary search entries of [0,1,2,3,4,5,6) in ordered array drow = [1,3,3,4,4], supposing A->rmap->n = 6. */
                        rowids, rowids + A->rmap->n,  /* return in rowend[] the index of last position in drow[] where value could be inserted without violating the ordering */
                        rowend.begin()); /* rowend = [0,1,1,3,5,5]. A leading 0 will be added later */
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0; /* a->i = [0,0,1,1,3,5,5] */
    cerr = cudaMemcpy(a->i+1,rowend.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->j,dcol.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    for (PetscInt r = 0; r < A->rmap->n; r++) {
      const PetscInt len = a->i[r+1] - a->i[r];
      if (len) nzrows++; /* count the rows that are not empty */
      a->ilen[r] = a->imax[r] = len;
      if (a->rmax < len) a->rmax = len;
    }
    a->nonzerorowcnt = nzrows;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzrows,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
4090ed502f03SStefano Zampini 
/*@C
    MatSeqAIJCUSPARSEGetIJ - returns the device row storage i and j indices for MATSEQAIJCUSPARSE matrices.

   Not collective

    Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

    Output Parameters:
+   i - the CSR row pointers (pass NULL if not needed)
-   j - the CSR column indices (pass NULL if not needed)

    Level: developer

    Notes:
      When compressed is true, the CSR structure does not contain empty rows

      The routine returns immediately, without touching the matrix, when both i and j are NULL.

      May trigger a host-device copy since MatSeqAIJCUSPARSECopyToGPU() is called.

.seealso: MatSeqAIJCUSPARSERestoreIJ(), MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* nothing to do only when neither output is requested; a single requested output is still served */
  if (!i && !j) PetscFunctionReturn(0);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) { /* built once from the host row offsets and cached */
        cusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
        ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(0);
}
41395f101d05SStefano Zampini 
/*@C
    MatSeqAIJCUSPARSERestoreIJ - restore the device row storage i and j indices obtained with MatSeqAIJCUSPARSEGetIJ()

   Not collective

    Input Parameters:
+   A - the matrix
-   compressed - PETSC_TRUE or PETSC_FALSE indicating the matrix data structure should be always returned in compressed form

    Output Parameters:
+   i - the CSR row pointers, set to NULL on return (may be NULL)
-   j - the CSR column indices, set to NULL on return (may be NULL)

    Level: developer

.seealso: MatSeqAIJCUSPARSEGetIJ()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int** i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* invalidate the caller's copies of the device pointers */
  if (i != NULL) {
    *i = NULL;
  }
  if (j != NULL) {
    *j = NULL;
  }
  PetscFunctionReturn(0);
}
41665f101d05SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

   Only the CSR storage format is handled; matrices using the ELL or HYB format generate a PETSC_ERR_SUP error

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* A is dereferenced only now, after the header and type checks above have passed */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* make sure the device copy of the matrix is up to date before exposing it */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  /* guard the CSR container itself as well as its values array */
  if (!csr || !csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}
4202ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from MatSeqAIJCUSPARSEGetArrayRead()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Input/Output Parameter:
.   a - pointer to the device data obtained from MatSeqAIJCUSPARSEGetArrayRead(), set to NULL on return

   Level: developer

.seealso: MatSeqAIJCUSPARSEGetArrayRead()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);

  /* read-only access: the matrix itself is left untouched, only the caller's pointer is cleared */
  *a = NULL;
  PetscFunctionReturn(0);
}
4227ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: may trigger host-device copies if up-to-date matrix data is on host

   Only the CSR storage format is handled; matrices using the ELL or HYB format generate a PETSC_ERR_SUP error

   On success the offload mask of A is set to PETSC_OFFLOAD_GPU and MatSeqAIJCUSPARSEInvalidateTranspose() is called on A

.seealso: MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSEGetArrayWrite(), MatSeqAIJCUSPARSERestoreArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* A is dereferenced only now, after the header and type checks above have passed */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* the caller may read the current values, so bring the device copy up to date first */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  /* guard the CSR container itself as well as its values array */
  if (!csr || !csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the values are exposed for writing: mark the device copy as the valid one */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*@C
   MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from MatSeqAIJCUSPARSEGetArray()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Input/Output Parameter:
.   a - pointer to the device data obtained from MatSeqAIJCUSPARSEGetArray(), set to NULL on return

   Level: developer

   Notes: increases the object state of A

.seealso: MatSeqAIJCUSPARSEGetArray()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);

  /* the values were exposed for writing: bump the object state of A */
  ierr = PetscObjectStateIncrease((PetscObject)A);
  CHKERRQ(ierr);

  /* clear the caller's pointer so it is not used after the restore */
  *a = NULL;
  PetscFunctionReturn(0);
}
4292039c6fbaSStefano Zampini 
/*@C
   MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a MATSEQAIJCUSPARSE matrix is stored

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Output Parameter:
.   a - pointer to the device data

   Level: developer

   Notes: does not trigger host-device copies and flags data validity on the GPU

   Only the CSR storage format is handled; matrices using the ELL or HYB format generate a PETSC_ERR_SUP error

   The device data structure of A must already exist, otherwise a PETSC_ERR_COR error is generated

.seealso: MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSEGetArrayRead(), MatSeqAIJCUSPARSERestoreArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* A is dereferenced only now, after the header and type checks above have passed */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  /* no MatSeqAIJCUSPARSECopyToGPU() here: the current values are not copied from the host */
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  /* guard the CSR container itself as well as its values array */
  if (!csr || !csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the values are exposed for writing: mark the device copy as the valid one */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4329ed502f03SStefano Zampini 
/*@C
   MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from MatSeqAIJCUSPARSEGetArrayWrite()

   Not Collective

   Input Parameter:
.   A - a MATSEQAIJCUSPARSE matrix

   Input/Output Parameter:
.   a - pointer to the device data obtained from MatSeqAIJCUSPARSEGetArrayWrite(), set to NULL on return

   Level: developer

   Notes: increases the object state of A

.seealso: MatSeqAIJCUSPARSEGetArrayWrite()
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);

  /* the values were exposed for writing: bump the object state of A */
  ierr = PetscObjectStateIncrease((PetscObject)A);
  CHKERRQ(ierr);

  /* clear the caller's pointer so it is not used after the restore */
  *a = NULL;
  PetscFunctionReturn(0);
}
4357ed502f03SStefano Zampini 
/*
  Lexicographic "less than" on the first two entries of a 4-tuple; the third and fourth
  entries do not take part in the comparison.

  MatSeqAIJCUSPARSEMergeMats() passes it to thrust::merge() with tuples of the form
  (row index, column index, value, flag), i.e. entries are ordered by row and then by column.

  The call operator is const-qualified so the comparator can be invoked through a const
  object, and the entries are read with the free function thrust::get<N>() instead of the
  member accessor.
*/
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) const
  {
    const int row1 = thrust::get<0>(t1);
    const int row2 = thrust::get<0>(t2);

    /* different rows: the row index alone decides */
    if (row1 != row2) return row1 < row2;
    /* same row: fall back to the column index */
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
4368ed502f03SStefano Zampini 
/*
  Unary functor adding a fixed integer offset to its argument.

  MatSeqAIJCUSPARSEMergeMats() uses it with thrust::make_transform_iterator() and
  thrust::transform() to offset integer index arrays.

  The constructor is callable from host and device like the call operator, and the call
  operator is const-qualified since it does not modify the functor.
*/
struct Shift
{
  int _shift; /* offset added to every input */

  __host__ __device__
  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c) const
  {
    return c + _shift;
  }
};
4380ed502f03SStefano Zampini 
4381ddea5d60SJunchao Zhang /* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows. [A';B']' operation in matlab notation */
4382ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
4383ed502f03SStefano Zampini {
4384ed502f03SStefano Zampini   PetscErrorCode               ierr;
4385ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
4386ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
4387ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
4388ed502f03SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
4389ed502f03SStefano Zampini   PetscInt                     Annz,Bnnz;
4390ed502f03SStefano Zampini   cusparseStatus_t             stat;
4391ed502f03SStefano Zampini   PetscInt                     i,m,n,zero = 0;
4392ed502f03SStefano Zampini   cudaError_t                  cerr;
4393ed502f03SStefano Zampini 
4394ed502f03SStefano Zampini   PetscFunctionBegin;
4395ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
4396ed502f03SStefano Zampini   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
4397ed502f03SStefano Zampini   PetscValidPointer(C,4);
4398ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
4399ed502f03SStefano Zampini   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
4400c0aa6a63SJacob Faibussowitsch   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n);
4401ed502f03SStefano Zampini   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
4402ed502f03SStefano Zampini   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4403ed502f03SStefano Zampini   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4404ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4405ed502f03SStefano Zampini     m     = A->rmap->n;
4406ed502f03SStefano Zampini     n     = A->cmap->n + B->cmap->n;
4407ed502f03SStefano Zampini     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
4408ed502f03SStefano Zampini     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
4409ed502f03SStefano Zampini     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
4410ed502f03SStefano Zampini     c     = (Mat_SeqAIJ*)(*C)->data;
4411ed502f03SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4412ed502f03SStefano Zampini     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
4413ed502f03SStefano Zampini     Ccsr  = new CsrMatrix;
4414ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
4415ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
4416ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
4417ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
4418ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
4419ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
4420ed502f03SStefano Zampini     Ccusp->nrows    = m;
4421ed502f03SStefano Zampini     Ccusp->mat      = Cmat;
4422ed502f03SStefano Zampini     Ccusp->mat->mat = Ccsr;
4423ed502f03SStefano Zampini     Ccsr->num_rows  = m;
4424ed502f03SStefano Zampini     Ccsr->num_cols  = n;
4425ed502f03SStefano Zampini     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
4426ed502f03SStefano Zampini     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4427ed502f03SStefano Zampini     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4428ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4429ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4430ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4431ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4432ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4433ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4434ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4435ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4436ed502f03SStefano Zampini     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4437ed502f03SStefano Zampini     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4438ed502f03SStefano Zampini 
4439ed502f03SStefano Zampini     Acsr = (CsrMatrix*)Acusp->mat->mat;
4440ed502f03SStefano Zampini     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4441ed502f03SStefano Zampini     Annz = (PetscInt)Acsr->column_indices->size();
4442ed502f03SStefano Zampini     Bnnz = (PetscInt)Bcsr->column_indices->size();
4443ed502f03SStefano Zampini     c->nz = Annz + Bnnz;
4444ed502f03SStefano Zampini     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
4445ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
4446ed502f03SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
4447ed502f03SStefano Zampini     Ccsr->num_entries = c->nz;
4448ed502f03SStefano Zampini     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
4449ed502f03SStefano Zampini     if (c->nz) {
44502ed87e7eSStefano Zampini       auto Acoo = new THRUSTINTARRAY32(Annz);
44512ed87e7eSStefano Zampini       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
44522ed87e7eSStefano Zampini       auto Ccoo = new THRUSTINTARRAY32(c->nz);
44532ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff,*Broff;
44542ed87e7eSStefano Zampini 
4455ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
4456ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
4457ed502f03SStefano Zampini           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
4458ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
4459ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
4460ed502f03SStefano Zampini         }
44612ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
44622ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
4463ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
4464ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
4465ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
4466ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
4467ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
4468ed502f03SStefano Zampini         }
44692ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
44702ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
4471ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
44722ed87e7eSStefano Zampini       stat = cusparseXcsr2coo(Acusp->handle,
44732ed87e7eSStefano Zampini                               Aroff->data().get(),
44742ed87e7eSStefano Zampini                               Annz,
44752ed87e7eSStefano Zampini                               m,
44762ed87e7eSStefano Zampini                               Acoo->data().get(),
44772ed87e7eSStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4478ed502f03SStefano Zampini       stat = cusparseXcsr2coo(Bcusp->handle,
44792ed87e7eSStefano Zampini                               Broff->data().get(),
4480ed502f03SStefano Zampini                               Bnnz,
4481ed502f03SStefano Zampini                               m,
44822ed87e7eSStefano Zampini                               Bcoo->data().get(),
4483ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
44842ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
44852ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
44862ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
44878909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4488ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4489ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
44908909a122SStefano Zampini #else
44918909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
44928909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
44938909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
44948909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
44958909a122SStefano Zampini #endif
44962ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
44972ed87e7eSStefano Zampini       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
44982ed87e7eSStefano Zampini       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
44992ed87e7eSStefano Zampini       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
45002ed87e7eSStefano Zampini       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
45012ed87e7eSStefano Zampini       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4502ed502f03SStefano Zampini       auto p1 = Ccusp->cooPerm->begin();
4503ed502f03SStefano Zampini       auto p2 = Ccusp->cooPerm->begin();
4504ed502f03SStefano Zampini       thrust::advance(p2,Annz);
45052ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
45068909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
45078909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
45088909a122SStefano Zampini #endif
45092ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
45102ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
45112ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
45122ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
45132ed87e7eSStefano Zampini #else
45142ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
45152ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
45162ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
45172ed87e7eSStefano Zampini #endif
4518ed502f03SStefano Zampini       stat = cusparseXcoo2csr(Ccusp->handle,
45192ed87e7eSStefano Zampini                               Ccoo->data().get(),
4520ed502f03SStefano Zampini                               c->nz,
4521ed502f03SStefano Zampini                               m,
4522ed502f03SStefano Zampini                               Ccsr->row_offsets->data().get(),
4523ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4524ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
45252ed87e7eSStefano Zampini       delete wPerm;
45262ed87e7eSStefano Zampini       delete Acoo;
45272ed87e7eSStefano Zampini       delete Bcoo;
45282ed87e7eSStefano Zampini       delete Ccoo;
4529ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4530ed502f03SStefano Zampini       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4531ed502f03SStefano Zampini                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4532ed502f03SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4533ed502f03SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4534ed502f03SStefano Zampini #endif
45351a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
45363606e59fSJunchao Zhang         ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(A);CHKERRQ(ierr);
45373606e59fSJunchao Zhang         ierr = MatSeqAIJCUSPARSEFormExplicitTranspose(B);CHKERRQ(ierr);
4538ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4539ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4540ed502f03SStefano Zampini         CsrMatrix *CcsrT = new CsrMatrix;
4541ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4542ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4543ed502f03SStefano Zampini 
45441a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
45451a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4546a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu = NULL;
4547ed502f03SStefano Zampini         CmatT->cprowIndices = NULL;
4548ed502f03SStefano Zampini         CmatT->mat = CcsrT;
4549ed502f03SStefano Zampini         CcsrT->num_rows = n;
4550ed502f03SStefano Zampini         CcsrT->num_cols = m;
4551ed502f03SStefano Zampini         CcsrT->num_entries = c->nz;
4552ed502f03SStefano Zampini 
4553ed502f03SStefano Zampini         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4554ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4555ed502f03SStefano Zampini         CcsrT->values = new THRUSTARRAY(c->nz);
4556ed502f03SStefano Zampini 
4557ed502f03SStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4558ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4559ed502f03SStefano Zampini         if (AT) {
4560ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4561ed502f03SStefano Zampini           thrust::advance(rT,-1);
4562ed502f03SStefano Zampini         }
4563ed502f03SStefano Zampini         if (BT) {
4564ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4565ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4566ed502f03SStefano Zampini           thrust::copy(titb,tite,rT);
4567ed502f03SStefano Zampini         }
4568ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4569ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4570ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4571ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4572ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4573ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4574ed502f03SStefano Zampini         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4575ed502f03SStefano Zampini 
4576ed502f03SStefano Zampini         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4577ed502f03SStefano Zampini         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4578ed502f03SStefano Zampini         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4579ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4580ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4581ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4582ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4583ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4584ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4585ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4586ed502f03SStefano Zampini         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4587ed502f03SStefano Zampini                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4588ed502f03SStefano Zampini                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4589ed502f03SStefano Zampini                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4590ed502f03SStefano Zampini #endif
4591ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4592ed502f03SStefano Zampini       }
4593ed502f03SStefano Zampini     }
4594ed502f03SStefano Zampini 
4595ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4596ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4597ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
4598ed502f03SStefano Zampini     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4599ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4600ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4601ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4602ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4603ed502f03SStefano Zampini       ii   = *Ccsr->row_offsets;
4604ed502f03SStefano Zampini       jj   = *Ccsr->column_indices;
4605ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4606ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4607ed502f03SStefano Zampini     } else {
4608ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4609ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4610ed502f03SStefano Zampini     }
4611ed502f03SStefano Zampini     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4612ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4613ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4614ed502f03SStefano Zampini     c->maxnz = c->nz;
4615ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4616ed502f03SStefano Zampini     c->rmax = 0;
4617ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4618ed502f03SStefano Zampini       const PetscInt nn = c->i[i+1] - c->i[i];
4619ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4620ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
4621ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax,nn);
4622ed502f03SStefano Zampini     }
4623ed502f03SStefano Zampini     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4624ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4625ed502f03SStefano Zampini     (*C)->nonzerostate++;
4626ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4627ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4628ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4629ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4630ed502f03SStefano Zampini   } else {
4631c0aa6a63SJacob Faibussowitsch     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,(*C)->rmap->n,B->rmap->n);
4632ed502f03SStefano Zampini     c = (Mat_SeqAIJ*)(*C)->data;
4633ed502f03SStefano Zampini     if (c->nz) {
4634ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4635ed502f03SStefano Zampini       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4636ed502f03SStefano Zampini       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4637ed502f03SStefano Zampini       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4638ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4639ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4640ed502f03SStefano Zampini       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4641ed502f03SStefano Zampini       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4642ed502f03SStefano Zampini       Acsr = (CsrMatrix*)Acusp->mat->mat;
4643ed502f03SStefano Zampini       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4644ed502f03SStefano Zampini       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4645c0aa6a63SJacob Faibussowitsch       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %" PetscInt_FMT " != %" PetscInt_FMT,Acsr->num_entries,(PetscInt)Acsr->values->size());
4646c0aa6a63SJacob Faibussowitsch       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %" PetscInt_FMT " != %" PetscInt_FMT,Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4647c0aa6a63SJacob Faibussowitsch       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT,Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4648c0aa6a63SJacob Faibussowitsch       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT,Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4649c0aa6a63SJacob Faibussowitsch       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %" PetscInt_FMT " != %" PetscInt_FMT,(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4650ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
4651ed502f03SStefano Zampini       thrust::advance(pmid,Acsr->num_entries);
4652ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4653ed502f03SStefano Zampini       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4654ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4655ed502f03SStefano Zampini       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4656ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4657ed502f03SStefano Zampini       thrust::for_each(zibait,zieait,VecCUDAEquals());
4658ed502f03SStefano Zampini       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4659ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4660ed502f03SStefano Zampini       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4661ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4662ed502f03SStefano Zampini       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4663a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
46641a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4665ed502f03SStefano Zampini         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4666ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4667ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4668ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4669ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4670ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4671ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4672ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
46731a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4674ed502f03SStefano Zampini       }
4675ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4676ed502f03SStefano Zampini     }
4677ed502f03SStefano Zampini   }
4678ed502f03SStefano Zampini   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4679ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4680ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4681ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4682ed502f03SStefano Zampini   PetscFunctionReturn(0);
4683ed502f03SStefano Zampini }
4684c215019aSStefano Zampini 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies entries of the matrix value array of A into v.

  Input Parameters:
+ A   - the matrix; its value array is obtained with MatSeqAIJCUSPARSEGetArrayRead() and is
        used below as a device pointer
. n   - number of entries to copy
- idx - indices into the value array of the entries to copy, read on the host; if NULL
        (or if n is zero) the first n entries of the value array are copied contiguously

  Output Parameter:
. v   - destination of the n copied values; isCudaMem() (defined elsewhere) decides whether it
        is treated as device memory (written in place) or as host memory (filled through a
        temporary device buffer followed by a device-to-host copy)

  Notes:
  The gather relies on VecCUDAEquals (defined elsewhere), which is expected to assign the first
  element of each zipped tuple to the second one, i.e. v[i] = av[idx[i]].
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the host index list so that the gather can run on the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    /* Staging buffer, only populated when v is host memory. It is a scoped object (instead of
       new/delete) so that it is released even if an error check below returns early. */
    THRUSTARRAY                     w(dmem ? 0 : n);
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* pair av[widx[i]] with dv[i] for i = 0,...,n-1 and copy the former into the latter */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (!dmem) {
      /* bring the gathered values back to the host destination */
      cerr = cudaMemcpy(v,w.data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
  } else {
    /* no index list: plain contiguous copy of the first n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* when v is host memory the values travelled from the device to the host: log that direction */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4724