xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision a49f1ed07369b0148ac1ebfef7e25a6063171f6f)
/*
  Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
*/
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
89ae82921SPaul Mullowney 
93d13b8fdSMatthew G. Knepley #include <petscconf.h>
103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
13af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
149ae82921SPaul Mullowney #undef VecType
153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16bc3f50f2SPaul Mullowney 
/*
  String tables handed to PetscOptionsEnum() further down in this file. Each table lists the
  selectable value names in 0-based enum order, followed by the enum type name, the prefix
  shared by the enum constants, and a terminating 0.
*/
const char *const MatCUSPARSEStorageFormats[] = {
  "CSR",
  "ELL",
  "HYB",
  "MatCUSPARSEStorageFormat",
  "MAT_CUSPARSE_",
  0
};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
/*
  The enums below are copied from cusparse.h in CUDA-11.0 for reference. The entries of
  MatCUSPARSESpMVAlgorithms[] etc. follow their 0-based integer value order, because
  PetscOptionsEnum() maps a user's command line choice to its position in the table.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
*/
const char *const MatCUSPARSESpMVAlgorithms[] = {
  "MV_ALG_DEFAULT",
  "COOMV_ALG",
  "CSRMV_ALG1",
  "CSRMV_ALG2",
  "cusparseSpMVAlg_t",
  "CUSPARSE_",
  0
};
/* positions 4 and 5 are CSR_ALG1 and COO_ALG4, matching the enum values 4 and 5 listed above */
const char *const MatCUSPARSESpMMAlgorithms[] = {
  "ALG_DEFAULT",
  "COO_ALG1",
  "COO_ALG2",
  "COO_ALG3",
  "CSR_ALG1",
  "COO_ALG4",
  "CSR_ALG2",
  "cusparseSpMMAlg_t",
  "CUSPARSE_SPMM_",
  0
};
const char *const MatCUSPARSECsr2CscAlgorithms[] = {
  "INVALID", /* cusparse does not have enum 0! We created one */
  "ALG1",
  "ALG2",
  "cusparseCsr2CscAlg_t",
  "CUSPARSE_CSR2CSC_",
  0
};
#endif
539ae82921SPaul Mullowney 
/* Forward declarations. Everything marked static has internal linkage; the PETSC_INTERN prototypes are library-internal symbols. */

/* LU / ILU factorization */
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo*);

/* Cholesky / ICC factorization */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo*);

/* solves */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec);

/* options and matrix-level operations */
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject, Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar);

/* matrix-vector products */
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool);

/* teardown helpers; MatSeqAIJCUSPARSEMultStruct_Destroy is overloaded on its first argument type */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**, MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* host/device data movement and transpose bookkeeping */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool);
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]);

/* COO assembly interface */
PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode);
92c215019aSStefano Zampini 
/*
  MatCUSPARSESetStream - Stores a CUDA stream in the matrix's cuSPARSE data and binds the
  cuSPARSE handle kept there to that stream.

  Input Parameters:
+ A      - the matrix; A->spptr is interpreted as a Mat_SeqAIJCUSPARSE
- stream - the CUDA stream to record and use

  Fails with PETSC_ERR_COR when A->spptr is NULL, and with a cuSPARSE error if
  cusparseSetStream() does not succeed.
*/
PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cs;

  PetscFunctionBegin;
  if (!gpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  gpu->stream = stream;
  cs          = cusparseSetStream(gpu->handle,gpu->stream);CHKERRCUSPARSE(cs);
  PetscFunctionReturn(0);
}
104b06137fdSPaul Mullowney 
/*
  MatCUSPARSESetHandle - Makes the matrix use the given cuSPARSE handle and switches that
  handle to device pointer mode.

  Input Parameters:
+ A      - the matrix; A->spptr is interpreted as a Mat_SeqAIJCUSPARSE
- handle - the cuSPARSE handle to install

  If the matrix already holds a different, non-null handle, that one is destroyed with
  cusparseDestroy() before the new handle is stored. Fails with PETSC_ERR_COR when
  A->spptr is NULL.
*/
PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusparseStatus_t   cs;

  PetscFunctionBegin;
  if (!gpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (gpu->handle != handle) {
    /* release the handle currently held before replacing it */
    if (gpu->handle) {cs = cusparseDestroy(gpu->handle);CHKERRCUSPARSE(cs);}
    gpu->handle = handle;
  }
  /* applied even when the handle was already the installed one */
  cs = cusparseSetPointerMode(gpu->handle,CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(cs);
  PetscFunctionReturn(0);
}
121b06137fdSPaul Mullowney 
/*
  MatCUSPARSEClearHandle - Forgets the cuSPARSE handle stored in a MATSEQAIJCUSPARSE matrix.

  Input Parameter:
. A - the matrix

  The stored handle is only set to 0; cusparseDestroy() is not called here. Matrices of any
  other type, or with a NULL spptr, are left untouched.
*/
PetscErrorCode MatCUSPARSEClearHandle(Mat A)
{
  PetscErrorCode     ierr;
  PetscBool          iscusparse;
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  if (iscusparse && gpu) gpu->handle = 0;
  PetscFunctionReturn(0);
}
134b06137fdSPaul Mullowney 
/*
  MatFactorGetSolverType_seqaij_cusparse - Reports MATSOLVERCUSPARSE as the solver type.

  Input Parameter:
. A - the matrix (not inspected)

  Output Parameter:
. type - set to MATSOLVERCUSPARSE
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  (*type) = MATSOLVERCUSPARSE;
  PetscFunctionReturn(0);
}
1419ae82921SPaul Mullowney 
/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
1549ae82921SPaul Mullowney 
/*
  MatGetFactor_seqaijcusparse_cusparse - Creates the (empty) MATSEQAIJCUSPARSE matrix that will
  hold a factorization of A and installs the symbolic factorization routines on it.

  Input Parameters:
+ A     - the matrix to be factored; only its local row count, communicator and (for LU-type
          factors) block sizes are used here
- ftype - the requested factor type

  Output Parameter:
. B - the new factor matrix, of size n x n with n = A->rmap->n

  MAT_FACTOR_LU, MAT_FACTOR_ILU and MAT_FACTOR_ILUDT get the ILU/LU symbolic routines,
  MAT_FACTOR_CHOLESKY and MAT_FACTOR_ICC get the ICC/Cholesky ones. Any other factor type
  fails with PETSC_ERR_SUP; in that case no matrix is created and *B is left untouched.
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode  ierr;
  PetscInt        n = A->rmap->n;
  const PetscBool lufamily   = (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) ? PETSC_TRUE : PETSC_FALSE;
  const PetscBool cholfamily = (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) ? PETSC_TRUE : PETSC_FALSE;

  PetscFunctionBegin;
  /* reject unsupported factor types before creating anything, so the error path cannot leak a half-built matrix */
  if (!lufamily && !cholfamily) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");

  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  /* the factor type is stored before MatSetType() is called; keep this order */
  (*B)->factortype = ftype;
  (*B)->useordering = PETSC_TRUE;
  ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  if (lufamily) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
  } else {
    (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
    (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
  }

  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1809ae82921SPaul Mullowney 
/*
  MatCUSPARSESetFormat_SeqAIJCUSPARSE - Stores the requested GPU storage format in the
  matrix's Mat_SeqAIJCUSPARSE data.

  Input Parameters:
+ A      - the matrix; A->spptr is interpreted as a Mat_SeqAIJCUSPARSE
. op     - MAT_CUSPARSE_MULT or MAT_CUSPARSE_ALL; both write the same format field
- format - the storage format to record

  Any other operation fails with PETSC_ERR_SUP.
*/
PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  if (op != MAT_CUSPARSE_MULT && op != MAT_CUSPARSE_ALL) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  gpu->format = format;
  PetscFunctionReturn(0);
}
1989ae82921SPaul Mullowney 
/*@
   MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
   operation. Only the MatMult operation can use different GPU storage formats
   for MPIAIJCUSPARSE matrices.

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
.  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
-  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)

   Level: intermediate

.seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
@*/
PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  /* forward to whatever implementation is registered on A under "MatCUSPARSESetFormat_C" */
  ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
225e057df02SPaul Mullowney 
/*@
   MatSeqAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Collective on mat

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE
-  transgen - the boolean flag

   Level: intermediate

   Notes:
   Matrices that are not of type SEQAIJCUSPARSE are silently ignored. Factored matrices are rejected
   with PETSC_ERR_ARG_WRONGSTATE. Passing PETSC_FALSE also invalidates any transpose already stored.

.seealso: MATSEQAIJCUSPARSE, MatAIJCUSPARSESetGenerateTranspose()
@*/
PetscErrorCode MatSeqAIJCUSPARSESetGenerateTranspose(Mat A,PetscBool transgen)
{
  PetscErrorCode     ierr;
  PetscBool          iscusparse;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  ierr = PetscObjectTypeCompare(((PetscObject)A),MATSEQAIJCUSPARSE,&iscusparse);CHKERRQ(ierr);
  if (!iscusparse) PetscFunctionReturn(0);

  if (A->factortype) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
  cusp           = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cusp->transgen = transgen;
  /* need to destroy the transpose matrix if present to prevent logic errors if transgen is set to true later */
  if (!transgen) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
  PetscFunctionReturn(0);
}
258e6e9a74fSStefano Zampini 
/*
  MatSetFromOptions_SeqAIJCUSPARSE - Processes the SeqAIJCUSPARSE command line options.

  Input Parameters:
+ PetscOptionsObject - the options context used by the PetscOptionsXXX() calls
- A                  - the matrix; A->spptr is interpreted as a Mat_SeqAIJCUSPARSE

  Options are only queried for unfactored matrices (A->factortype == MAT_FACTOR_NONE):
    -mat_cusparse_transgen, -mat_cusparse_mult_storage_format, -mat_cusparse_storage_format
  and, when built against CUDA 11 or newer,
    -mat_cusparse_spmv_alg, -mat_cusparse_spmm_alg, -mat_cusparse_csr2csc_alg.
  In the CUDA 11 branch each algorithm field is first assigned its default and then possibly
  overwritten by the user's choice.
*/
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  Mat_SeqAIJCUSPARSE       *gpu = (Mat_SeqAIJCUSPARSE*)A->spptr;
  MatCUSPARSEStorageFormat fmt;
  PetscBool                isset;
  PetscErrorCode           ierr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscBool gentrans = gpu->transgen;

    /* explicit transpose generation */
    ierr = PetscOptionsBool("-mat_cusparse_transgen","Generate explicit transpose for MatMultTranspose","MatSeqAIJCUSPARSESetGenerateTranspose",gentrans,&gentrans,&isset);CHKERRQ(ierr);
    if (isset) {
      ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,gentrans);CHKERRQ(ierr);
    }

    /* storage format used for SpMV only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)gpu->format,(PetscEnum*)&fmt,&isset);CHKERRQ(ierr);
    if (isset) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,fmt);CHKERRQ(ierr);
    }

    /* storage format used for all operations */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)gpu->format,(PetscEnum*)&fmt,&isset);CHKERRQ(ierr);
    if (isset) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);
    }
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* SpMV algorithm; default, since we only support csr */
    gpu->spmvAlg = CUSPARSE_CSRMV_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
                            "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)gpu->spmvAlg,(PetscEnum*)&gpu->spmvAlg,&isset);CHKERRQ(ierr);
    /* PetscOptionsEnum() assigns enum values by position in MatCUSPARSESpMVAlgorithms[]; if the user set the option, verify that this still agrees with cuSPARSE */
    if (isset && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");

    /* SpMM algorithm; default, only support column-major dense matrix B */
    gpu->spmmAlg = CUSPARSE_SPMM_CSR_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
                            "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)gpu->spmmAlg,(PetscEnum*)&gpu->spmmAlg,&isset);CHKERRQ(ierr);
    if (isset && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");

    /* CSR to CSC conversion algorithm */
    gpu->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
    ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
                            "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)gpu->csr2cscAlg,(PetscEnum*)&gpu->csr2cscAlg,&isset);CHKERRQ(ierr);
    if (isset && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
   #endif
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3029ae82921SPaul Mullowney 
/*
  MatILUFactorSymbolic_SeqAIJCUSPARSE - Symbolic ILU for a cuSPARSE factor matrix.

  Resets the triangular-factor data attached to B->spptr, delegates the symbolic phase to
  MatILUFactorSymbolic_SeqAIJ(), and then installs the cuSPARSE numeric LU routine on B.

  Input Parameters:
+ B     - the factor matrix; B->spptr is interpreted as Mat_SeqAIJCUSPARSETriFactors
. A     - the matrix being factored
. isrow - row permutation
. iscol - column permutation
- info  - factorization options
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* assigned after the SeqAIJ symbolic call, so this is the numeric routine that stays installed */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3149ae82921SPaul Mullowney 
/*
  MatLUFactorSymbolic_SeqAIJCUSPARSE - Symbolic LU for a cuSPARSE factor matrix.

  Resets the triangular-factor data attached to B->spptr, delegates the symbolic phase to
  MatLUFactorSymbolic_SeqAIJ(), and then installs the cuSPARSE numeric LU routine on B.

  Input Parameters:
+ B     - the factor matrix; B->spptr is interpreted as Mat_SeqAIJCUSPARSETriFactors
. A     - the matrix being factored
. isrow - row permutation
. iscol - column permutation
- info  - factorization options
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
  /* assigned after the SeqAIJ symbolic call, so this is the numeric routine that stays installed */
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
3269ae82921SPaul Mullowney 
/*
  MatICCFactorSymbolic_SeqAIJCUSPARSE - Symbolic ICC for a cuSPARSE factor matrix.

  Resets the triangular-factor data attached to B->spptr, delegates the symbolic phase to
  MatICCFactorSymbolic_SeqAIJ(), and then installs the cuSPARSE numeric Cholesky routine on B.

  Input Parameters:
+ B    - the factor matrix; B->spptr is interpreted as Mat_SeqAIJCUSPARSETriFactors
. A    - the matrix being factored
. perm - permutation
- info - factorization options
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  /* assigned after the SeqAIJ symbolic call, so this is the numeric routine that stays installed */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
338087f3262SPaul Mullowney 
/*
  MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - Symbolic Cholesky for a cuSPARSE factor matrix.

  Resets the triangular-factor data attached to B->spptr, delegates the symbolic phase to
  MatCholeskyFactorSymbolic_SeqAIJ(), and then installs the cuSPARSE numeric Cholesky routine on B.

  Input Parameters:
+ B    - the factor matrix; B->spptr is interpreted as Mat_SeqAIJCUSPARSETriFactors
. A    - the matrix being factored
. perm - permutation
- info - factorization options
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(&factors);CHKERRQ(ierr);
  ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
  /* assigned after the SeqAIJ symbolic call, so this is the numeric routine that stays installed */
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(0);
}
350087f3262SPaul Mullowney 
351087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
3529ae82921SPaul Mullowney {
3539ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3549ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3559ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
356aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
3579ae82921SPaul Mullowney   cusparseStatus_t                  stat;
3589ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
3599ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3609ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3619ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
362b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
36357d48284SJunchao Zhang   cudaError_t                       cerr;
3649ae82921SPaul Mullowney 
3659ae82921SPaul Mullowney   PetscFunctionBegin;
366cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
367c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3689ae82921SPaul Mullowney     try {
3699ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3709ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
371da79fbbcSStefano Zampini       if (!loTriFactor) {
3722cbc15d9SMark         PetscScalar                       *AALo;
3732cbc15d9SMark 
3742cbc15d9SMark         cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
3759ae82921SPaul Mullowney 
3769ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
37757d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
37857d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
3799ae82921SPaul Mullowney 
3809ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
3819ae82921SPaul Mullowney         AiLo[0]  = (PetscInt) 0;
3829ae82921SPaul Mullowney         AiLo[n]  = nzLower;
3839ae82921SPaul Mullowney         AjLo[0]  = (PetscInt) 0;
3849ae82921SPaul Mullowney         AALo[0]  = (MatScalar) 1.0;
3859ae82921SPaul Mullowney         v        = aa;
3869ae82921SPaul Mullowney         vi       = aj;
3879ae82921SPaul Mullowney         offset   = 1;
3889ae82921SPaul Mullowney         rowOffset= 1;
3899ae82921SPaul Mullowney         for (i=1; i<n; i++) {
3909ae82921SPaul Mullowney           nz = ai[i+1] - ai[i];
391e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
3929ae82921SPaul Mullowney           AiLo[i]    = rowOffset;
3939ae82921SPaul Mullowney           rowOffset += nz+1;
3949ae82921SPaul Mullowney 
395580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
396580bdb30SBarry Smith           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
3979ae82921SPaul Mullowney 
3989ae82921SPaul Mullowney           offset      += nz;
3999ae82921SPaul Mullowney           AjLo[offset] = (PetscInt) i;
4009ae82921SPaul Mullowney           AALo[offset] = (MatScalar) 1.0;
4019ae82921SPaul Mullowney           offset      += 1;
4029ae82921SPaul Mullowney 
4039ae82921SPaul Mullowney           v  += nz;
4049ae82921SPaul Mullowney           vi += nz;
4059ae82921SPaul Mullowney         }
4062205254eSKarl Rupp 
407aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
408da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
409da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
410aa372e3fSPaul Mullowney         /* Create the matrix description */
41157d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
41257d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4131b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
414afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
415afb2bd1cSJunchao Zhang        #else
41657d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
417afb2bd1cSJunchao Zhang        #endif
41857d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
41957d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
420aa372e3fSPaul Mullowney 
421aa372e3fSPaul Mullowney         /* set the operation */
422aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
423aa372e3fSPaul Mullowney 
424aa372e3fSPaul Mullowney         /* set the matrix */
425aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
426aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = n;
427aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = n;
428aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
429aa372e3fSPaul Mullowney 
430aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
431aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
432aa372e3fSPaul Mullowney 
433aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
434aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
435aa372e3fSPaul Mullowney 
436aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
437aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
438aa372e3fSPaul Mullowney 
439afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
440da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
441afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
4421b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
443afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
444afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
445afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
446afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
447afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
448afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
449afb2bd1cSJunchao Zhang       #endif
450afb2bd1cSJunchao Zhang 
451aa372e3fSPaul Mullowney         /* perform the solve analysis */
452aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
453aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
454aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
455afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
4561b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
457afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
458afb2bd1cSJunchao Zhang                                #endif
459afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
460da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
461da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
462aa372e3fSPaul Mullowney 
463da79fbbcSStefano Zampini         /* assign the pointer */
464aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
4652cbc15d9SMark         loTriFactor->AA_h = AALo;
46657d48284SJunchao Zhang         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
46757d48284SJunchao Zhang         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
4684863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
469da79fbbcSStefano Zampini       } else { /* update values only */
4702cbc15d9SMark         if (!loTriFactor->AA_h) {
4712cbc15d9SMark           cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
4722cbc15d9SMark         }
473da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
4742cbc15d9SMark         loTriFactor->AA_h[0]  = 1.0;
475da79fbbcSStefano Zampini         v        = aa;
476da79fbbcSStefano Zampini         vi       = aj;
477da79fbbcSStefano Zampini         offset   = 1;
478da79fbbcSStefano Zampini         for (i=1; i<n; i++) {
479da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i];
4802cbc15d9SMark           ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
481da79fbbcSStefano Zampini           offset      += nz;
4822cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
483da79fbbcSStefano Zampini           offset      += 1;
484da79fbbcSStefano Zampini           v  += nz;
485da79fbbcSStefano Zampini         }
4862cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
487da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
488da79fbbcSStefano Zampini       }
4899ae82921SPaul Mullowney     } catch(char *ex) {
4909ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
4919ae82921SPaul Mullowney     }
4929ae82921SPaul Mullowney   }
4939ae82921SPaul Mullowney   PetscFunctionReturn(0);
4949ae82921SPaul Mullowney }
4959ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - builds, or refreshes the values of, the upper
  triangular factor stored in the Mat_SeqAIJCUSPARSETriFactors attached to A->spptr.

  Input Parameter:
. A - the matrix; A->data is read as a Mat_SeqAIJ (its j, diag and a arrays are used)

  Notes:
  Nothing is done when the matrix has no local rows, or when A->offloadmask is neither
  PETSC_OFFLOAD_UNALLOCATED nor PETSC_OFFLOAD_CPU.

  Row i of the upper factor is read from positions adiag[i+1]+1 .. adiag[i] of a->a / a->j;
  the last of these entries (v[nz]) is treated as the diagonal. In the arrays built here the
  diagonal is placed first in each row and stored as 1/v[nz], followed by the nz off-diagonal
  entries. NOTE(review): this presumably reflects a factor that stores the inverted diagonal -
  confirm against the SeqAIJ factorization code.

  If no upper factor exists yet, the full CSR structure is assembled in pinned host buffers,
  handed to the factor's CsrMatrix, and the cuSPARSE solve analysis is run. The pinned value
  buffer is kept in upTriFactor->AA_h for later reuse; the index buffers are released.
  If the factor already exists, only the values are recomputed and reassigned.

  Errors raised as char* exceptions inside the try block are reported as PETSC_ERR_LIB.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
  PetscInt                          n = A->rmap->n;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  cusparseStatus_t                  stat;
  const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
  const MatScalar                   *aa = a->a,*v;
  PetscInt                          *AiUp, *AjUp;
  PetscInt                          i,nz, nzUpper, offset;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    try {
      /* next, figure out the number of nonzeros in the upper triangular matrix. */
      nzUpper = adiag[0]-adiag[n];
      if (!upTriFactor) {
        PetscScalar *AAUp;

        /* pinned host buffer for the values; kept afterwards as upTriFactor->AA_h */
        cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);

        /* Allocate space for the row offsets and column indices of the upper triangular matrix */
        cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
        cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);

        /* Fill the upper triangular matrix, walking the rows from last to first */
        AiUp[0]=(PetscInt) 0;
        AiUp[n]=nzUpper;
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;
          vi = aj + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements (stored as the reciprocal of v[nz]) */
          AjUp[offset] = (PetscInt) i;
          AAUp[offset] = (MatScalar)1./v[nz];
          AiUp[i]      = AiUp[i+1] - (nz+1);

          /* then the nz off-diagonal column indices and values */
          ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
          ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
        }

        /* allocate space for the triangular factor information */
        ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
        upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

        /* Create the matrix description */
        stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
       #else
        stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
       #endif
        stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

        /* set the operation */
        upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

        /* set the matrix: sizes first, then the three CSR arrays filled from the host buffers */
        upTriFactor->csrMat = new CsrMatrix;
        upTriFactor->csrMat->num_rows = n;
        upTriFactor->csrMat->num_cols = n;
        upTriFactor->csrMat->num_entries = nzUpper;

        upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
        upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);

        upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
        upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);

        upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
        upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);

        /* Create the solve analysis information */
        ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
        stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
        /* query the work buffer size needed by the solve and allocate that buffer */
        stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
                                       upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                       upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                       upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
                                       &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
        cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
      #endif

        /* perform the solve analysis */
        stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
                                 upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
                                 upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
                                 upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
                               #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                 ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                               #endif
);CHKERRCUSPARSE(stat);
        /* wait so that the logged event covers the whole analysis */
        cerr = WaitForCUDA();CHKERRCUDA(cerr);
        ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

        /* assign the pointer */
        ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
        /* keep the pinned value buffer for later value-only updates; the index buffers are no longer needed */
        upTriFactor->AA_h = AAUp;
        cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
        cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
        ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      } else { /* the factor already exists: update values only */
        if (!upTriFactor->AA_h) {
          cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
        }
        /* Fill the upper triangular matrix values, same layout as in the initial build */
        offset = nzUpper;
        for (i=n-1; i>=0; i--) {
          v  = aa + adiag[i+1] + 1;

          /* number of elements NOT on the diagonal */
          nz = adiag[i] - adiag[i+1]-1;

          /* decrement the offset */
          offset -= (nz+1);

          /* first, set the diagonal elements */
          upTriFactor->AA_h[offset] = 1./v[nz];
          ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
        }
        upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
        ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
      }
    } catch(char *ex) {
      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
    }
  }
  PetscFunctionReturn(0);
}
6399ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - builds the lower and upper ILU triangular factors
  for A and prepares the auxiliary data kept in the Mat_SeqAIJCUSPARSETriFactors at A->spptr.

  Input Parameter:
. A - the matrix; A->data is read as a Mat_SeqAIJ (its row, icol and nz fields are used)

  Notes:
  Fails with PETSC_ERR_COR if A->spptr is NULL.

  After both factors are built, a work vector of length n is created if it does not exist yet,
  the nonzero count is recorded, and A->offloadmask is set to PETSC_OFFLOAD_BOTH.

  The row (a->row) and column (a->icol) index sets are copied into rpermIndices/cpermIndices
  only when they are not identity permutations and have not been copied before.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           isrow = a->row,iscol = a->icol;
  PetscBool                    row_identity,col_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* create the work vector once and reuse it on later calls */
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  cusparseTriFactors->nnz=a->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;
  /* row permutation indices: copied only for a non-identity ordering, and only once */
  ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
  if (!row_identity && !cusparseTriFactors->rpermIndices) {
    const PetscInt *r;

    ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->rpermIndices->assign(r, r+n);
    ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }

  /* column permutation indices: copied only for a non-identity ordering, and only once */
  ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
  if (!col_identity && !cusparseTriFactors->cpermIndices) {
    const PetscInt *c;

    ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
    cusparseTriFactors->cpermIndices->assign(c, c+n);
    ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
6839ae82921SPaul Mullowney 
684087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
685087f3262SPaul Mullowney {
686087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
687087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
688aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
689aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
690087f3262SPaul Mullowney   cusparseStatus_t                  stat;
691087f3262SPaul Mullowney   PetscErrorCode                    ierr;
69257d48284SJunchao Zhang   cudaError_t                       cerr;
693087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
694087f3262SPaul Mullowney   PetscScalar                       *AAUp;
695087f3262SPaul Mullowney   PetscScalar                       *AALo;
696087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
697087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
698087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
699087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
700087f3262SPaul Mullowney 
701087f3262SPaul Mullowney   PetscFunctionBegin;
702cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
703c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
704087f3262SPaul Mullowney     try {
705da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
706da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
707da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
708087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
70957d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
71057d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
711087f3262SPaul Mullowney 
712087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
713087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
714087f3262SPaul Mullowney         AiUp[n]=nzUpper;
715087f3262SPaul Mullowney         offset = 0;
716087f3262SPaul Mullowney         for (i=0; i<n; i++) {
717087f3262SPaul Mullowney           /* set the pointers */
718087f3262SPaul Mullowney           v  = aa + ai[i];
719087f3262SPaul Mullowney           vj = aj + ai[i];
720087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
721087f3262SPaul Mullowney 
722087f3262SPaul Mullowney           /* first, set the diagonal elements */
723087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
72409f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
725087f3262SPaul Mullowney           AiUp[i]      = offset;
72609f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
727087f3262SPaul Mullowney 
728087f3262SPaul Mullowney           offset+=1;
729087f3262SPaul Mullowney           if (nz>0) {
730f22e0265SBarry Smith             ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
731580bdb30SBarry Smith             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
732087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
733087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
734087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
735087f3262SPaul Mullowney             }
736087f3262SPaul Mullowney             offset+=nz;
737087f3262SPaul Mullowney           }
738087f3262SPaul Mullowney         }
739087f3262SPaul Mullowney 
740aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
741da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
742da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
743087f3262SPaul Mullowney 
744aa372e3fSPaul Mullowney         /* Create the matrix description */
74557d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
74657d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
7471b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
748afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
749afb2bd1cSJunchao Zhang        #else
75057d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
751afb2bd1cSJunchao Zhang        #endif
75257d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
75357d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
754087f3262SPaul Mullowney 
755aa372e3fSPaul Mullowney         /* set the matrix */
756aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
757aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
758aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
759aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
760aa372e3fSPaul Mullowney 
761aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
762aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
763aa372e3fSPaul Mullowney 
764aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
765aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
766aa372e3fSPaul Mullowney 
767aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
768aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
769aa372e3fSPaul Mullowney 
770afb2bd1cSJunchao Zhang         /* set the operation */
771afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
772afb2bd1cSJunchao Zhang 
773afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
774da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
775afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
7761b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
777afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
778afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
779afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
780afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
781afb2bd1cSJunchao Zhang                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
782afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
783afb2bd1cSJunchao Zhang       #endif
784afb2bd1cSJunchao Zhang 
785aa372e3fSPaul Mullowney         /* perform the solve analysis */
786aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
787aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
788aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
789afb2bd1cSJunchao Zhang                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
7901b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
791afb2bd1cSJunchao Zhang                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
792afb2bd1cSJunchao Zhang                                 #endif
793afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
794da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
795da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
796aa372e3fSPaul Mullowney 
797da79fbbcSStefano Zampini         /* assign the pointer */
798aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
799aa372e3fSPaul Mullowney 
800aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
801da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
802da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
803aa372e3fSPaul Mullowney 
804aa372e3fSPaul Mullowney         /* Create the matrix description */
80557d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
80657d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
8071b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
808afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
809afb2bd1cSJunchao Zhang        #else
81057d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
811afb2bd1cSJunchao Zhang        #endif
81257d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
81357d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
814aa372e3fSPaul Mullowney 
815aa372e3fSPaul Mullowney         /* set the operation */
816aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
817aa372e3fSPaul Mullowney 
818aa372e3fSPaul Mullowney         /* set the matrix */
819aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
820aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
821aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
822aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
823aa372e3fSPaul Mullowney 
824aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
825aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
826aa372e3fSPaul Mullowney 
827aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
828aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
829aa372e3fSPaul Mullowney 
830aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
831aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
832aa372e3fSPaul Mullowney 
833afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
834da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
835afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
8361b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
837afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
838afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
839afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
840afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
841afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
842afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
843afb2bd1cSJunchao Zhang       #endif
844afb2bd1cSJunchao Zhang 
845aa372e3fSPaul Mullowney         /* perform the solve analysis */
846aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
847aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
848aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
849afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
8501b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
851afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
852afb2bd1cSJunchao Zhang                                 #endif
853afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
854da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
855da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
856aa372e3fSPaul Mullowney 
857da79fbbcSStefano Zampini         /* assign the pointer */
858aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
859087f3262SPaul Mullowney 
860da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
86157d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
86257d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
863da79fbbcSStefano Zampini       } else {
864da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
865da79fbbcSStefano Zampini         offset = 0;
866da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
867da79fbbcSStefano Zampini           /* set the pointers */
868da79fbbcSStefano Zampini           v  = aa + ai[i];
869da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
870da79fbbcSStefano Zampini 
871da79fbbcSStefano Zampini           /* first, set the diagonal elements */
872da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
873da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
874da79fbbcSStefano Zampini 
875da79fbbcSStefano Zampini           offset+=1;
876da79fbbcSStefano Zampini           if (nz>0) {
877da79fbbcSStefano Zampini             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
878da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
879da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
880da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
881da79fbbcSStefano Zampini             }
882da79fbbcSStefano Zampini             offset+=nz;
883da79fbbcSStefano Zampini           }
884da79fbbcSStefano Zampini         }
885da79fbbcSStefano Zampini         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
886da79fbbcSStefano Zampini         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
887da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
888da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
889da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
890da79fbbcSStefano Zampini       }
89157d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
89257d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
893087f3262SPaul Mullowney     } catch(char *ex) {
894087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
895087f3262SPaul Mullowney     }
896087f3262SPaul Mullowney   }
897087f3262SPaul Mullowney   PetscFunctionReturn(0);
898087f3262SPaul Mullowney }
899087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - builds the ICC triangular factors of A on the GPU
  (via MatSeqAIJCUSPARSEBuildICCTriMatrices) and, when the factorization ordering is not the
  identity, uploads the permutation and its inverse for use by the solves.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Returns 0 on success, PETSC_ERR_COR if A->spptr is NULL, or any error raised by a callee.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJ                   *aij                = (Mat_SeqAIJ*)A->data;
  IS                           perm                = aij->row;
  PetscInt                     nrows               = A->rmap->n;
  PetscBool                    isIdentity;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);

  /* the work vector is created only once and reused afterwards */
  if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(nrows);

  /* count the off-diagonal entries twice and the diagonal once */
  cusparseTriFactors->nnz = (aij->nz-nrows)*2 + nrows;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* nothing more to upload when the ordering is the identity */
  ierr = ISIdentity(perm,&isIdentity);CHKERRQ(ierr);
  if (isIdentity) PetscFunctionReturn(0);

  {
    IS             invPerm;
    const PetscInt *invIdx,*fwdIdx;

    ierr = ISInvertPermutation(perm,PETSC_DECIDE,&invPerm);CHKERRQ(ierr);
    ierr = ISGetIndices(invPerm,&invIdx);CHKERRQ(ierr);
    ierr = ISGetIndices(perm,&fwdIdx);CHKERRQ(ierr);

    /* row permutation <- ordering, column permutation <- inverse ordering */
    cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(nrows);
    cusparseTriFactors->rpermIndices->assign(fwdIdx, fwdIdx+nrows);
    cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(nrows);
    cusparseTriFactors->cpermIndices->assign(invIdx, invIdx+nrows);

    ierr = ISRestoreIndices(invPerm,&invIdx);CHKERRQ(ierr);
    ierr = ISDestroy(&invPerm);CHKERRQ(ierr);
    ierr = ISRestoreIndices(perm,&fwdIdx);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*nrows*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
937087f3262SPaul Mullowney 
/*
  MatLUFactorNumeric_SeqAIJCUSPARSE - numeric LU factorization for SEQAIJCUSPARSE matrices.

  The factorization itself is done by MatLUFactorNumeric_SeqAIJ() after the values of A have
  been brought back with MatSeqAIJCUSPARSECopyFromGPU(); afterwards the solve callbacks of B
  are selected and the triangular factors are analysed and copied to the GPU.

  Input Parameters:
+ B    - the factor matrix (holds the row/column orderings)
. A    - the matrix to factor
- info - factorization options, forwarded unchanged to MatLUFactorNumeric_SeqAIJ()
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact   = (Mat_SeqAIJ*)B->data;
  IS             rowPerm = fact->row;
  IS             colPerm = fact->col;
  PetscBool      rowIsIdentity,colIsIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the MatSolve variant: the natural-ordering kernels need both orderings to be the identity */
  ierr = ISIdentity(rowPerm,&rowIsIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(colPerm,&colIsIdentity);CHKERRQ(ierr);
  if (!rowIsIdentity || !colIsIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }
  /* the multi-right-hand-side solves are cleared in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* build the triangular factors on the GPU */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
9689ae82921SPaul Mullowney 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - numeric Cholesky factorization for SEQAIJCUSPARSE matrices.

  The factorization itself is done by MatCholeskyFactorNumeric_SeqAIJ() after the values of A
  have been brought back with MatSeqAIJCUSPARSECopyFromGPU(); afterwards the solve callbacks of
  B are selected and the triangular factors are analysed and copied to the GPU.

  Input Parameters:
+ B    - the factor matrix (holds the ordering)
. A    - the matrix to factor
- info - factorization options, forwarded unchanged to MatCholeskyFactorNumeric_SeqAIJ()
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  IS             perm  = fact->row;
  PetscBool      permIsIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the MatSolve variant matching the ordering */
  ierr = ISIdentity(perm,&permIsIdentity);CHKERRQ(ierr);
  if (!permIsIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }
  /* the multi-right-hand-side solves are cleared in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* build the triangular factors on the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
9989ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSETransposeTriFactor_Private - builds the explicit transpose (CSC) of one
  triangular factor on the GPU and runs the cuSPARSE triangular-solve analysis on it.

  Input Parameters:
+ A                  - the factored matrix, used only as the object the log events are attached to
. cusparseTriFactors - container providing the cuSPARSE handle
- triFactor          - the factor to transpose; with CUDA >= 11 its csr2cscBuffer and
                       csr2cscBufferSize fields are (re)assigned here

  Output Parameter:
. triFactorTOut - the newly created transposed factor; set only if every step succeeded
*/
static PetscErrorCode MatSeqAIJCUSPARSETransposeTriFactor_Private(Mat A,Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors,Mat_SeqAIJCUSPARSETriFactorStruct *triFactor,Mat_SeqAIJCUSPARSETriFactorStruct **triFactorTOut)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *triFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the triangular factor */
  ierr = PetscNew(&triFactorT);CHKERRQ(ierr);
  triFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* copy the descriptor properties of the source factor; transposing swaps the fill mode */
  matrixType = cusparseGetMatType(triFactor->descr);
  indexBase  = cusparseGetMatIndexBase(triFactor->descr);
  fillMode   = cusparseGetMatFillMode(triFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType   = cusparseGetMatDiagType(triFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&triFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(triFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(triFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(triFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(triFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* the transpose is stored explicitly, so the solve itself is non-transposed */
  triFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the triangular factor */
  triFactorT->csrMat = new CsrMatrix;
  triFactorT->csrMat->num_rows       = triFactor->csrMat->num_cols;
  triFactorT->csrMat->num_cols       = triFactor->csrMat->num_rows;
  triFactorT->csrMat->num_entries    = triFactor->csrMat->num_entries;
  triFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(triFactorT->csrMat->num_rows+1);
  triFactorT->csrMat->column_indices = new THRUSTINTARRAY32(triFactorT->csrMat->num_entries);
  triFactorT->csrMat->values         = new THRUSTARRAY(triFactorT->csrMat->num_entries);

  /* compute the transpose of the triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* the conversion scratch buffer is kept on the source factor */
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                                       triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                                       triFactor->csrMat->values->data().get(),
                                       triFactor->csrMat->row_offsets->data().get(),
                                       triFactor->csrMat->column_indices->data().get(),
                                       triFactorT->csrMat->values->data().get(),
                                       triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &triFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&triFactor->csr2cscBuffer,triFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                          triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                          triFactor->csrMat->values->data().get(),
                          triFactor->csrMat->row_offsets->data().get(),
                          triFactor->csrMat->column_indices->data().get(),
                          triFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, triFactor->csr2cscBuffer
                        #else
                          triFactorT->csrMat->column_indices->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* End (not Begin): closes the GenerateTranspose event opened before the conversion */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&triFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, triFactorT->solveOp,
                                 triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                                 triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                                 triFactorT->csrMat->column_indices->data().get(), triFactorT->solveInfo,
                                 &triFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&triFactorT->solveBuffer,triFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, triFactorT->solveOp,
                           triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                           triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                           triFactorT->csrMat->column_indices->data().get(), triFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,triFactorT->solvePolicy, triFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  *triFactorTOut = triFactorT;
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - creates explicit transposes of the lower and upper
  triangular factors stored in A->spptr, analyses them for triangular solves, and stores them in
  loTriFactorPtrTranspose / upTriFactorPtrTranspose.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors whose
      loTriFactorPtr and upTriFactorPtr are already set up

  Notes:
  The lower factor is processed first and its transpose pointer is stored before the upper
  factor is touched, so an error in the second half leaves the first result attached.

  Each MAT_CUSPARSEGenerateTranspose event is closed with PetscLogEventEnd().
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* transpose of the lower triangular factor */
  ierr = MatSeqAIJCUSPARSETransposeTriFactor_Private(A,cusparseTriFactors,loTriFactor,&loTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /* transpose of the upper triangular factor */
  ierr = MatSeqAIJCUSPARSETransposeTriFactor_Private(A,cusparseTriFactors,upTriFactor,&upTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1200bda325fcSPaul Mullowney 
/* Functor converting a PetscScalar to a PetscInt by casting its real part; callable on host and device. */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    const PetscReal realPart = PetscRealPart(s);
    return static_cast<PetscInt>(realPart);
  }
};
1209*a49f1ed0SStefano Zampini 
1210b175d8bbSPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEGenerateTransposeForMult(Mat A)
1211bda325fcSPaul Mullowney {
1212aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1213*a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1214bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1215bda325fcSPaul Mullowney   cusparseStatus_t             stat;
1216aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
1217b06137fdSPaul Mullowney   cudaError_t                  err;
121885ba7357SStefano Zampini   PetscErrorCode               ierr;
1219b175d8bbSPaul Mullowney 
1220bda325fcSPaul Mullowney   PetscFunctionBegin;
1221*a49f1ed0SStefano Zampini   if (!cusparsestruct->transgen || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
1222*a49f1ed0SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1223*a49f1ed0SStefano Zampini   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1224*a49f1ed0SStefano Zampini   if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
1225*a49f1ed0SStefano Zampini   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1226*a49f1ed0SStefano Zampini   if (cusparsestruct->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matTranspose struct");
1227*a49f1ed0SStefano Zampini   if (cusparsestruct->transupdated) PetscFunctionReturn(0);
122885ba7357SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1229*a49f1ed0SStefano Zampini   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1230*a49f1ed0SStefano Zampini     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
1231*a49f1ed0SStefano Zampini   }
1232*a49f1ed0SStefano Zampini   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1233aa372e3fSPaul Mullowney     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
123457d48284SJunchao Zhang     stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1235aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
123657d48284SJunchao Zhang     stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
123757d48284SJunchao Zhang     stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1238aa372e3fSPaul Mullowney 
1239b06137fdSPaul Mullowney     /* set alpha and beta */
1240afb2bd1cSJunchao Zhang     err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
12417656d835SStefano Zampini     err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
12427656d835SStefano Zampini     err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1243afb2bd1cSJunchao Zhang     err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12447656d835SStefano Zampini     err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12457656d835SStefano Zampini     err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1246b06137fdSPaul Mullowney 
1247aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1248aa372e3fSPaul Mullowney       CsrMatrix *matrixT = new CsrMatrix;
1249*a49f1ed0SStefano Zampini       matstructT->mat = matrixT;
1250554b8892SKarl Rupp       matrixT->num_rows = A->cmap->n;
1251554b8892SKarl Rupp       matrixT->num_cols = A->rmap->n;
1252aa372e3fSPaul Mullowney       matrixT->num_entries = a->nz;
1253a8bd5306SMark Adams       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1254aa372e3fSPaul Mullowney       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1255aa372e3fSPaul Mullowney       matrixT->values = new THRUSTARRAY(a->nz);
1256a3fdcf43SKarl Rupp 
1257039c6fbaSStefano Zampini       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
125881902715SJunchao Zhang       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1259afb2bd1cSJunchao Zhang 
1260afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1261afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&matstructT->matDescr,
1262afb2bd1cSJunchao Zhang                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1263afb2bd1cSJunchao Zhang                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1264afb2bd1cSJunchao Zhang                                matrixT->values->data().get(),
1265afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1266afb2bd1cSJunchao Zhang                                indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1267afb2bd1cSJunchao Zhang      #endif
1268aa372e3fSPaul Mullowney     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1269afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1270afb2bd1cSJunchao Zhang       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1271afb2bd1cSJunchao Zhang    #else
1272aa372e3fSPaul Mullowney       CsrMatrix *temp  = new CsrMatrix;
127351c6d536SStefano Zampini       CsrMatrix *tempT = new CsrMatrix;
127451c6d536SStefano Zampini       /* First convert HYB to CSR */
1275aa372e3fSPaul Mullowney       temp->num_rows = A->rmap->n;
1276aa372e3fSPaul Mullowney       temp->num_cols = A->cmap->n;
1277aa372e3fSPaul Mullowney       temp->num_entries = a->nz;
1278aa372e3fSPaul Mullowney       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1279aa372e3fSPaul Mullowney       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1280aa372e3fSPaul Mullowney       temp->values = new THRUSTARRAY(a->nz);
1281aa372e3fSPaul Mullowney 
1282aa372e3fSPaul Mullowney       stat = cusparse_hyb2csr(cusparsestruct->handle,
1283aa372e3fSPaul Mullowney                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1284aa372e3fSPaul Mullowney                               temp->values->data().get(),
1285aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
128657d48284SJunchao Zhang                               temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1287aa372e3fSPaul Mullowney 
1288aa372e3fSPaul Mullowney       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1289aa372e3fSPaul Mullowney       tempT->num_rows = A->rmap->n;
1290aa372e3fSPaul Mullowney       tempT->num_cols = A->cmap->n;
1291aa372e3fSPaul Mullowney       tempT->num_entries = a->nz;
1292aa372e3fSPaul Mullowney       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1293aa372e3fSPaul Mullowney       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1294aa372e3fSPaul Mullowney       tempT->values = new THRUSTARRAY(a->nz);
1295aa372e3fSPaul Mullowney 
1296aa372e3fSPaul Mullowney       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1297aa372e3fSPaul Mullowney                               temp->num_cols, temp->num_entries,
1298aa372e3fSPaul Mullowney                               temp->values->data().get(),
1299aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
1300aa372e3fSPaul Mullowney                               temp->column_indices->data().get(),
1301aa372e3fSPaul Mullowney                               tempT->values->data().get(),
1302aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
1303aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
130457d48284SJunchao Zhang                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1305aa372e3fSPaul Mullowney 
1306aa372e3fSPaul Mullowney       /* Last, convert CSC to HYB */
1307aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat;
130857d48284SJunchao Zhang       stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1309aa372e3fSPaul Mullowney       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1310aa372e3fSPaul Mullowney         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1311aa372e3fSPaul Mullowney       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1312aa372e3fSPaul Mullowney                               matstructT->descr, tempT->values->data().get(),
1313aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
1314aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
131557d48284SJunchao Zhang                               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1316aa372e3fSPaul Mullowney 
1317aa372e3fSPaul Mullowney       /* assign the pointer */
1318aa372e3fSPaul Mullowney       matstructT->mat = hybMat;
1319*a49f1ed0SStefano Zampini       cusparsestruct->transupdated = PETSC_TRUE;
1320aa372e3fSPaul Mullowney       /* delete temporaries */
1321aa372e3fSPaul Mullowney       if (tempT) {
1322aa372e3fSPaul Mullowney         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1323aa372e3fSPaul Mullowney         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1324aa372e3fSPaul Mullowney         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1325aa372e3fSPaul Mullowney         delete (CsrMatrix*) tempT;
1326087f3262SPaul Mullowney       }
1327aa372e3fSPaul Mullowney       if (temp) {
1328aa372e3fSPaul Mullowney         if (temp->values) delete (THRUSTARRAY*) temp->values;
1329aa372e3fSPaul Mullowney         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1330aa372e3fSPaul Mullowney         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1331aa372e3fSPaul Mullowney         delete (CsrMatrix*) temp;
1332aa372e3fSPaul Mullowney       }
1333afb2bd1cSJunchao Zhang      #endif
1334aa372e3fSPaul Mullowney     }
1335*a49f1ed0SStefano Zampini   }
1336*a49f1ed0SStefano Zampini   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1337*a49f1ed0SStefano Zampini     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1338*a49f1ed0SStefano Zampini     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1339*a49f1ed0SStefano Zampini     if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix");
1340*a49f1ed0SStefano Zampini     if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix rows");
1341*a49f1ed0SStefano Zampini     if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix cols");
1342*a49f1ed0SStefano Zampini     if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values");
1343*a49f1ed0SStefano Zampini     if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT");
1344*a49f1ed0SStefano Zampini     if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT rows");
1345*a49f1ed0SStefano Zampini     if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT cols");
1346*a49f1ed0SStefano Zampini     if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT values");
1347*a49f1ed0SStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1348*a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1349*a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1350*a49f1ed0SStefano Zampini       ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
1351*a49f1ed0SStefano Zampini     }
1352*a49f1ed0SStefano Zampini     if (!cusparsestruct->csr2csc_i) {
1353*a49f1ed0SStefano Zampini       THRUSTARRAY csr2csc_a(matrix->num_entries);
1354*a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1355*a49f1ed0SStefano Zampini 
1356*a49f1ed0SStefano Zampini       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1357*a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1358*a49f1ed0SStefano Zampini       void   *csr2cscBuffer;
1359*a49f1ed0SStefano Zampini       size_t csr2cscBufferSize;
1360*a49f1ed0SStefano Zampini       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1361*a49f1ed0SStefano Zampini                                            A->cmap->n, matrix->num_entries,
1362*a49f1ed0SStefano Zampini                                            matrix->values->data().get(),
1363*a49f1ed0SStefano Zampini                                            cusparsestruct->rowoffsets_gpu->data().get(),
1364*a49f1ed0SStefano Zampini                                            matrix->column_indices->data().get(),
1365*a49f1ed0SStefano Zampini                                            matrixT->values->data().get(),
1366*a49f1ed0SStefano Zampini                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1367*a49f1ed0SStefano Zampini                                            CUSPARSE_ACTION_NUMERIC,indexBase,
1368*a49f1ed0SStefano Zampini                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
1369*a49f1ed0SStefano Zampini       err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
1370*a49f1ed0SStefano Zampini      #endif
1371*a49f1ed0SStefano Zampini 
1372*a49f1ed0SStefano Zampini       stat = cusparse_csr2csc(cusparsestruct->handle,
1373*a49f1ed0SStefano Zampini                               A->rmap->n,A->cmap->n,matrix->num_entries,
1374*a49f1ed0SStefano Zampini                               csr2csc_a.data().get(),cusparsestruct->rowoffsets_gpu->data().get(),matrix->column_indices->data().get(),
1375*a49f1ed0SStefano Zampini                               matrixT->values->data().get(),
1376*a49f1ed0SStefano Zampini                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1377*a49f1ed0SStefano Zampini                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1378*a49f1ed0SStefano Zampini                               CUSPARSE_ACTION_NUMERIC,indexBase,
1379*a49f1ed0SStefano Zampini                               cusparsestruct->csr2cscAlg, csr2cscBuffer
1380*a49f1ed0SStefano Zampini                              #else
1381*a49f1ed0SStefano Zampini                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1382*a49f1ed0SStefano Zampini                               CUSPARSE_ACTION_NUMERIC, indexBase
1383*a49f1ed0SStefano Zampini                              #endif
1384*a49f1ed0SStefano Zampini );CHKERRCUSPARSE(stat);
1385*a49f1ed0SStefano Zampini       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1386*a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1387*a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1388*a49f1ed0SStefano Zampini       err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
1389*a49f1ed0SStefano Zampini      #endif
1390*a49f1ed0SStefano Zampini     }
1391*a49f1ed0SStefano Zampini     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1392*a49f1ed0SStefano Zampini                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1393*a49f1ed0SStefano Zampini                                                      matrixT->values->begin()));
1394*a49f1ed0SStefano Zampini   }
139585ba7357SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1396213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1397213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1398aa372e3fSPaul Mullowney   /* assign the pointer */
1399aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1400*a49f1ed0SStefano Zampini   cusparsestruct->transupdated = PETSC_TRUE;
1401bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1402bda325fcSPaul Mullowney }
1403bda325fcSPaul Mullowney 
1404*a49f1ed0SStefano Zampini /* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - transposed triangular solves on the GPU using the
  transposed factors stored in the Mat_SeqAIJCUSPARSETriFactors attached to A->spptr.

  Input Parameters:
+ A  - the factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side (accessed read-only on the device)

  Output Parameter:
. xx - the solution (accessed write-only on the device)

  Steps performed:
    1. gather bb through rpermIndices into xx,
    2. solve with the transposed upper factor (xx -> work vector),
    3. solve with the transposed lower factor (work vector -> xx),
    4. gather xx through cpermIndices into the work vector and copy it back into xx.

  If neither transposed factor exists yet, MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() is
  called first to build them. Thrust calls go through PetscStackCallThrust(), like the other
  Thrust calls in this file, so that a Thrust failure is reported through PETSc error handling.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    /* reload: the analysis stores the new factors in the tri-factors container */
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation.
     The length of a permutation-iterator range is determined by the index iterators only,
     so both ends use the same element base (same form as in MatSolve_SeqAIJCUSPARSE). */
  PetscStackCallThrust(thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
                                    thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
                                    xGPU));

  /* First, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscStackCallThrust(thrust::copy(thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
                                    thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->end()),
                                    tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscStackCallThrust(thrust::copy(tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  /* wait for the device before closing the GPU timing interval */
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1490bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - transposed triangular solves on the GPU
  that apply no row or column permutation to the vectors.

  Input Parameters:
+ A  - the factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side (accessed read-only on the device)

  Output Parameter:
. xx - the solution (accessed write-only on the device)

  The transposed upper factor is solved from bb into the work vector, then the transposed
  lower factor is solved from the work vector into xx. The transposed factors are built by
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve() when neither of them is present.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lowerT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upperT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  PetscErrorCode                    ierr;
  cusparseStatus_t                  cusparseStat;
  cudaError_t                       cudaStat;

  PetscFunctionBegin;
  /* Build the transposed factors lazily when none is available yet */
  if (!(lowerT || upperT)) {
    ierr   = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    lowerT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
    upperT = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  }

  /* Device arrays of the output and input vectors */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Stage 1: transposed upper factor, rhs -> work vector */
  cusparseStat = cusparse_solve(factors->handle, upperT->solveOp,
                                upperT->csrMat->num_rows,
                              #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                upperT->csrMat->num_entries,
                              #endif
                                &PETSC_CUSPARSE_ONE, upperT->descr,
                                upperT->csrMat->values->data().get(),
                                upperT->csrMat->row_offsets->data().get(),
                                upperT->csrMat->column_indices->data().get(),
                                upperT->solveInfo,
                                rhs, work->data().get()
                              #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                ,upperT->solvePolicy, upperT->solveBuffer
                              #endif
);CHKERRCUSPARSE(cusparseStat);

  /* Stage 2: transposed lower factor, work vector -> solution */
  cusparseStat = cusparse_solve(factors->handle, lowerT->solveOp,
                                lowerT->csrMat->num_rows,
                              #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                lowerT->csrMat->num_entries,
                              #endif
                                &PETSC_CUSPARSE_ONE, lowerT->descr,
                                lowerT->csrMat->values->data().get(),
                                lowerT->csrMat->row_offsets->data().get(),
                                lowerT->csrMat->column_indices->data().get(),
                                lowerT->solveInfo,
                                work->data().get(), sol
                              #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                                ,lowerT->solvePolicy, lowerT->solveBuffer
                              #endif
);CHKERRCUSPARSE(cusparseStat);

  /* Hand the arrays back, then wait for the device before closing the timing interval */
  ierr     = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr     = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  cudaStat = WaitForCUDA();CHKERRCUDA(cudaStat);
  ierr     = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr     = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1558bda325fcSPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE - triangular solves on the GPU using the lower and upper factors
  stored in the Mat_SeqAIJCUSPARSETriFactors attached to A->spptr.

  Input Parameters:
+ A  - the factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side (accessed read-only on the device)

  Output Parameter:
. xx - the solution (accessed write-only on the device)

  Steps performed:
    1. gather bb through rpermIndices into the work vector,
    2. solve with the lower factor (work vector -> xx),
    3. solve with the upper factor (xx -> work vector),
    4. gather the work vector through cpermIndices into xx.

  Thrust calls go through PetscStackCallThrust(), like the other Thrust calls in this file,
  so that a Thrust failure is reported through PETSc error handling.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  PetscStackCallThrust(thrust::copy(thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
                                    thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
                                    tempGPU->begin()));

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation */
  PetscStackCallThrust(thrust::copy(thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
                                    thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
                                    xGPU));

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  /* wait for the device before closing the GPU timing interval */
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
16339ae82921SPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - Applies the two triangular factors stored in A->spptr
  to the right-hand side bb and stores the result in xx, with no row/column permutation step.

  Input Parameters:
+ A  - the factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side (accessed read-only on the device)

  Output Parameter:
. xx - the solution (accessed write-only on the device)

  Notes:
  The lower factor is solved from bb into the factor work vector, then the upper factor is solved
  from the work vector into xx. GPU time and 2*nnz - n flops (n = A->cmap->n) are logged.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  PetscErrorCode                    ierr;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Device pointers of the output (write) and input (read) vectors */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  {
    CsrMatrix   *lowerCsr = lower->csrMat;
    CsrMatrix   *upperCsr = upper->csrMat;
    PetscScalar *scratch  = work->data().get(); /* intermediate vector between the two solves */

    /* Step 1: lower-triangular solve, bb -> scratch */
    stat = cusparse_solve(factors->handle, lower->solveOp,
                          lowerCsr->num_rows,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          lowerCsr->num_entries,
                        #endif
                          &PETSC_CUSPARSE_ONE, lower->descr,
                          lowerCsr->values->data().get(),
                          lowerCsr->row_offsets->data().get(),
                          lowerCsr->column_indices->data().get(),
                          lower->solveInfo,
                          rhs, scratch
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          ,lower->solvePolicy, lower->solveBuffer
                        #endif
);CHKERRCUSPARSE(stat);

    /* Step 2: upper-triangular solve, scratch -> xx */
    stat = cusparse_solve(factors->handle, upper->solveOp,
                          upperCsr->num_rows,
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          upperCsr->num_entries,
                        #endif
                          &PETSC_CUSPARSE_ONE, upper->descr,
                          upperCsr->values->data().get(),
                          upperCsr->row_offsets->data().get(),
                          upperCsr->column_indices->data().get(),
                          upper->solveInfo,
                          scratch, sol
                        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                          ,upper->solvePolicy, upper->solveBuffer
                        #endif
);CHKERRCUSPARSE(stat);
  }

  ierr = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  /* wait for the device before closing the timing region */
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
16939ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSECopyFromGPU - Copies the nonzero values of A from the device CSR storage
  back into the host array when the offload mask is PETSC_OFFLOAD_GPU.

  Input Parameter:
. A - the matrix; A->spptr is read as a Mat_SeqAIJCUSPARSE whose mat->mat is treated as a CsrMatrix

  Notes:
  Only the a->nz values are transferred. After the copy the offload mask is set to
  PETSC_OFFLOAD_BOTH. For any other offload mask the routine does nothing.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *aij = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  /* nothing to transfer unless the offload mask says GPU */
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);

  CsrMatrix    *csr    = (CsrMatrix*)gpu->mat->mat;
  const size_t nbytes  = aij->nz*sizeof(PetscScalar);

  ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  cerr = cudaMemcpy(aij->a, csr->values->data().get(), nbytes, cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu(nbytes);CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
17147e8381f9SStefano Zampini 
/*
  MatSeqAIJGetArray_SeqAIJCUSPARSE - Returns the host array of nonzero values of A.

  Input Parameter:
. A - the matrix

  Output Parameter:
. array - set to the host values array (a->a)

  Notes:
  The values are first brought back from the device if needed (MatSeqAIJCUSPARSECopyFromGPU()).
  The offload mask is then set to PETSC_OFFLOAD_CPU.
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array         = ((Mat_SeqAIJ*)A->data)->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
17267e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSECopyToGPU - Brings the device representation of A up to date with the host data.

  Input Parameter:
. A - the matrix; A->spptr is read as a Mat_SeqAIJCUSPARSE

  Notes:
  Errors with PETSC_ERR_PLIB if A is bound to the CPU.

  Nothing is done unless the offload mask is PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.

  If the nonzero state of A equals the one recorded in the device structure and the storage
  format is CSR, only the a->nz values are re-uploaded into the existing device arrays.
  Otherwise the device structure (and the work vector / row offsets cached alongside it) is
  destroyed and rebuilt from the host CSR data, honouring compressed-row storage when in use.

  When the host has no values array (a->a is NULL) only the sparsity pattern is uploaded and the
  offload mask is left unchanged; in every other case it becomes PETSC_OFFLOAD_BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE; /* cleared when only the pattern (no values) is uploaded */
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* NOTE(review): with PETSC_FALSE this presumably keeps the transpose structure and only marks its values stale -- confirm */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz; /* number of column indices (and, if present, values) actually uploaded */
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* throw away everything that depends on the old nonzero pattern */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants; the handle is switched to device pointer mode below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          /* values storage is always allocated; it is only filled when the host has values */
          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
         #else
          /* temporary device CSR copy, converted to HYB/ELL and then released */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          /* mat and its arrays were all allocated just above, so they can be released unconditionally */
          delete (THRUSTARRAY*)mat->values;
          delete (THRUSTINTARRAY32*)mat->column_indices;
          delete (THRUSTINTARRAY32*)mat->row_offsets;
          delete (CsrMatrix*)mat;
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* log what was really transferred: m+1 row offsets and nnz column indices (int), tmp compressed-row
           indices (PetscInt), the 3 scalar constants, and the nnz values only when the host has them */
        ierr = PetscLogCpuToGpu(((m+1)+nnz)*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->a ? nnz : 0))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* remember which nonzero pattern the device structure corresponds to */
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
18839ae82921SPaul Mullowney 
/* Functor for tuple-wise traversal: adds element 0 of the tuple into element 1 (get<1> += get<0>) */
struct VecCUDAPlusEquals
{
  template <typename ZipTuple>
  __host__ __device__
  void operator()(ZipTuple entry)
  {
    thrust::get<1>(entry) += thrust::get<0>(entry);
  }
};
1893aa372e3fSPaul Mullowney 
/* Functor for tuple-wise traversal: copies element 0 of the tuple into element 1 (get<1> = get<0>) */
struct VecCUDAEquals
{
  template <typename ZipTuple>
  __host__ __device__
  void operator()(ZipTuple entry)
  {
    thrust::get<1>(entry) = thrust::get<0>(entry);
  }
};
19037e8381f9SStefano Zampini 
/* Functor for tuple-wise traversal: copies element 1 of the tuple into element 0 (get<0> = get<1>),
   i.e. the opposite direction of VecCUDAEquals */
struct VecCUDAEqualsReverse
{
  template <typename ZipTuple>
  __host__ __device__
  void operator()(ZipTuple entry)
  {
    thrust::get<0>(entry) = thrust::get<1>(entry);
  }
};
1913e6e9a74fSStefano Zampini 
/* Per-product context attached to a MatProduct (C->product->data) for the CUSPARSE mat-mat kernels;
   released by MatDestroy_MatMatCusparse() */
struct MatMatCusparse {
  PetscBool             cisdense;      /* NOTE(review): presumably records whether C was plain (host) dense before the product -- confirm */
  PetscScalar           *Bt;           /* device buffer, released with cudaFree(); presumably an explicit transpose of B -- confirm */
  Mat                   X;             /* intermediate dense matrix that receives the product for the RARt and PtAP cases */
  PetscBool             reusesym;      /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;         /* flop count; how it is used is not visible here */
  CsrMatrix             *Bcsr;         /* CSR wrapper released with delete; the arrays it points to are not freed by the destructor */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;   /* sparse descriptor, destroyed with cusparseDestroySpMat() */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;     /* dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr;     /* dense descriptor for C (or for X) */
  PetscInt              Blda,Clda;     /* Record leading dimensions of B and C here to detect changes */
  size_t                mmBufferSize;  /* size in bytes of mmBuffer */
  void                  *mmBuffer;     /* device work buffer passed to cusparseSpMM() */
  void                  *mmBuffer2;    /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;    /* SpGEMM descriptor, destroyed with cusparseSpGEMM_destroyDescr() */
#endif
};
1933ccdfe979SStefano Zampini 
/*
  MatDestroy_MatMatCusparse - Releases a MatMatCusparse product context.

  Input Parameter:
. data - pointer to the MatMatCusparse to destroy

  Notes:
  Frees the device buffer Bt, deletes the Bcsr wrapper, destroys (CUDA >= 11 only) whichever
  cuSPARSE descriptors and work buffers are set, destroys the intermediate matrix X and finally
  frees the context itself with PetscFree().
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *ctx = (MatMatCusparse *)data;
  PetscErrorCode   ierr;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(ctx->Bt);CHKERRCUDA(cerr);
  delete ctx->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* each of these is released only if it has been set */
  if (ctx->matSpBDescr) {
    stat = cusparseDestroySpMat(ctx->matSpBDescr);CHKERRCUSPARSE(stat);
  }
  if (ctx->mmBuffer) {
    cerr = cudaFree(ctx->mmBuffer);CHKERRCUDA(cerr);
  }
  if (ctx->mmBuffer2) {
    cerr = cudaFree(ctx->mmBuffer2);CHKERRCUDA(cerr);
  }
  if (ctx->matBDescr) {
    stat = cusparseDestroyDnMat(ctx->matBDescr);CHKERRCUSPARSE(stat);
  }
  if (ctx->matCDescr) {
    stat = cusparseDestroyDnMat(ctx->matCDescr);CHKERRCUSPARSE(stat);
  }
  if (ctx->spgemmDesc) {
    stat = cusparseSpGEMM_destroyDescr(ctx->spgemmDesc);CHKERRCUSPARSE(stat);
  }
 #endif
  ierr = MatDestroy(&ctx->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1958ccdfe979SStefano Zampini 
1959ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1960ccdfe979SStefano Zampini 
1961ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1962ccdfe979SStefano Zampini {
1963ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
1964ccdfe979SStefano Zampini   Mat                          A,B;
1965afb2bd1cSJunchao Zhang   PetscInt                     m,n,blda,clda;
1966ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
1967ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
1968ccdfe979SStefano Zampini   cusparseStatus_t             stat;
1969ccdfe979SStefano Zampini   cusparseOperation_t          opA;
1970ccdfe979SStefano Zampini   const PetscScalar            *barray;
1971ccdfe979SStefano Zampini   PetscScalar                  *carray;
1972ccdfe979SStefano Zampini   PetscErrorCode               ierr;
1973ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
1974ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
1975ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
1976afb2bd1cSJunchao Zhang   cudaError_t                  cerr;
1977ccdfe979SStefano Zampini 
1978ccdfe979SStefano Zampini   PetscFunctionBegin;
1979ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1980ccdfe979SStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1981ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
1982ccdfe979SStefano Zampini   A    = product->A;
1983ccdfe979SStefano Zampini   B    = product->B;
1984ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1985ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1986ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
1987ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
1988ccdfe979SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1989ccdfe979SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1990ccdfe979SStefano Zampini   cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
1991ccdfe979SStefano Zampini   switch (product->type) {
1992ccdfe979SStefano Zampini   case MATPRODUCT_AB:
1993ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
1994ccdfe979SStefano Zampini     mat = cusp->mat;
1995ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1996ccdfe979SStefano Zampini     m   = A->rmap->n;
1997ccdfe979SStefano Zampini     n   = B->cmap->n;
1998ccdfe979SStefano Zampini     break;
1999ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
2000e6e9a74fSStefano Zampini     if (!cusp->transgen) {
2001e6e9a74fSStefano Zampini       mat = cusp->mat;
2002e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2003e6e9a74fSStefano Zampini     } else {
2004ccdfe979SStefano Zampini       ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
2005ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
2006ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2007e6e9a74fSStefano Zampini     }
2008ccdfe979SStefano Zampini     m = A->cmap->n;
2009ccdfe979SStefano Zampini     n = B->cmap->n;
2010ccdfe979SStefano Zampini     break;
2011ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2012ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2013ccdfe979SStefano Zampini     mat = cusp->mat;
2014ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2015ccdfe979SStefano Zampini     m   = A->rmap->n;
2016ccdfe979SStefano Zampini     n   = B->rmap->n;
2017ccdfe979SStefano Zampini     break;
2018ccdfe979SStefano Zampini   default:
2019ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2020ccdfe979SStefano Zampini   }
2021ccdfe979SStefano Zampini   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2022ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
2023ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
2024ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
2025afb2bd1cSJunchao Zhang   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
2026ccdfe979SStefano Zampini   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
2027afb2bd1cSJunchao Zhang 
2028ccdfe979SStefano Zampini   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
2029c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2030c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2031c8378d12SStefano Zampini     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
2032c8378d12SStefano Zampini   } else {
2033c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
2034c8378d12SStefano Zampini     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
2035c8378d12SStefano Zampini   }
2036c8378d12SStefano Zampini 
2037c8378d12SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2038afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2039afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2040fcdce8c4SStefano Zampini   /* (re)allcoate mmBuffer if not initialized or LDAs are different */
2041afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2042fcdce8c4SStefano Zampini     size_t mmBufferSize;
2043afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2044afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
2045afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2046afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2047afb2bd1cSJunchao Zhang     }
2048c8378d12SStefano Zampini 
2049afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2050afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2051afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2052afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2053afb2bd1cSJunchao Zhang     }
2054afb2bd1cSJunchao Zhang 
2055afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
2056afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&mat->matDescr,
2057afb2bd1cSJunchao Zhang                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2058afb2bd1cSJunchao Zhang                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2059afb2bd1cSJunchao Zhang                                csrmat->values->data().get(),
2060afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2061afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2062afb2bd1cSJunchao Zhang     }
2063afb2bd1cSJunchao Zhang     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2064afb2bd1cSJunchao Zhang                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2065afb2bd1cSJunchao Zhang                                    mmdata->matCDescr,cusparse_scalartype,
2066fcdce8c4SStefano Zampini                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2067fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2068fcdce8c4SStefano Zampini       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2069fcdce8c4SStefano Zampini       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2070fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2071fcdce8c4SStefano Zampini     }
2072afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2073afb2bd1cSJunchao Zhang   } else {
2074afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
2075afb2bd1cSJunchao Zhang     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2076afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2077afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2078afb2bd1cSJunchao Zhang   }
2079afb2bd1cSJunchao Zhang 
2080afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2081afb2bd1cSJunchao Zhang   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2082afb2bd1cSJunchao Zhang                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2083afb2bd1cSJunchao Zhang                       mmdata->matCDescr,cusparse_scalartype,
2084fcdce8c4SStefano Zampini                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2085afb2bd1cSJunchao Zhang  #else
2086afb2bd1cSJunchao Zhang   PetscInt k;
2087afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2088ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2089ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2090ccdfe979SStefano Zampini     cublasStatus_t cerr;
2091ccdfe979SStefano Zampini 
2092ccdfe979SStefano Zampini     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
2093ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2094ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
2095ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
2096ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
2097ccdfe979SStefano Zampini                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2098ccdfe979SStefano Zampini     blda = B->cmap->n;
2099afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2100afb2bd1cSJunchao Zhang   } else {
2101afb2bd1cSJunchao Zhang     k    = B->rmap->n;
2102ccdfe979SStefano Zampini   }
2103ccdfe979SStefano Zampini 
2104afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2105ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2106afb2bd1cSJunchao Zhang                            csrmat->num_entries,mat->alpha_one,mat->descr,
2107ccdfe979SStefano Zampini                            csrmat->values->data().get(),
2108ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
2109ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
2110ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2111ccdfe979SStefano Zampini                            carray,clda);CHKERRCUSPARSE(stat);
2112afb2bd1cSJunchao Zhang  #endif
2113afb2bd1cSJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2114c8378d12SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2115c8378d12SStefano Zampini   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
2116ccdfe979SStefano Zampini   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
2117ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2118ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2119ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2120ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2121ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2122ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2123ccdfe979SStefano Zampini   } else {
2124ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
2125ccdfe979SStefano Zampini   }
2126ccdfe979SStefano Zampini   if (mmdata->cisdense) {
2127ccdfe979SStefano Zampini     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
2128ccdfe979SStefano Zampini   }
2129ccdfe979SStefano Zampini   if (!biscuda) {
2130ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2131ccdfe979SStefano Zampini   }
2132ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2133ccdfe979SStefano Zampini }
2134ccdfe979SStefano Zampini 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - Symbolic phase of a product between a
  MATSEQAIJCUSPARSE matrix A (CSR storage format only) and a dense matrix B.

  Input Parameter:
. C - the product matrix; C->product supplies A, B and the product type

  Supported product types: AB, AtB, ABt, PtAP, RARt. Any other type raises PETSC_ERR_PLIB.

  On success:
  - C has local and global sizes m x n (determined by the product type) and type MATSEQDENSECUDA;
  - C->product->data holds a freshly allocated MatMatCusparse and C->product->destroy is
    MatDestroy_MatMatCusparse;
  - C->ops->productnumeric is MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA.

  Errors (all PETSC_ERR_PLIB): product data already present, A not of type MATSEQAIJCUSPARSE,
  A not stored in MAT_CUSPARSE_CSR format, unsupported product type.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");

  /* local sizes (m x n) of the result for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU;
     the type must be queried before it is overwritten by MatSetType() below */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data: hand ownership to C->product immediately after allocation (as the
     SeqAIJCUSPARSE x SeqAIJCUSPARSE symbolic routine does), so that the struct and anything
     stored in it below stay reachable through C->product->destroy if a later call fails */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;
  mmdata->cisdense    = cisdense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif
  /* for these products we need intermediate storage */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2208ccdfe979SStefano Zampini 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - Numeric phase of a sparse-sparse product
  (AB, AtB or ABt) in which A, B and C are all MATSEQAIJCUSPARSE matrices in CSR storage format.

  Input Parameter:
. C - the product matrix; C->product->data must hold the MatMatCusparse set up by the symbolic phase

  Flow:
  - when mmdata->reusesym is set, the flag is cleared, the GPU structures of C are validated and
    the computation is skipped;
  - when C has no nonzeros (c->nz == 0), the computation is skipped;
  - otherwise A and B are copied to the GPU, the mult structures matching the (possibly
    symmetry-reduced) product type are selected and the values of C are computed with cuSPARSE:
    cusparseSpGEMM_compute + cusparseSpGEMM_copy for CUDA >= 11, cusparse_csr_spgemm otherwise.
  In every case the routine ends by marking C as assembled.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product                  *product = C->product;
  Mat_SeqAIJ                   *c = (Mat_SeqAIJ*)C->data;
  /* every declaration lives here, ahead of any "goto finalize", so no jump crosses an initialization */
  const cusparseOperation_t    opN = CUSPARSE_OPERATION_NON_TRANSPOSE;
  Mat                          A,B;
  Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
  Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
  CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
  MatMatCusparse               *mmdata;
  MatProductType               ptype;
  PetscBool                    sametype;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  cudaError_t                  cerr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t         BmatSpDescr;
#endif

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&sametype);CHKERRQ(ierr);
  if (!sametype) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
  A      = product->A;
  B      = product->B;
  mmdata = (MatMatCusparse*)C->product->data;

  /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
  if (mmdata->reusesym) {
    mmdata->reusesym = PETSC_FALSE; /* one-shot: the next call recomputes the values */
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
    goto finalize;
  }

  /* nothing to compute for an empty result */
  if (!c->nz) goto finalize;

  /* operand sanity checks */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&sametype);CHKERRQ(ierr);
  if (!sametype) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&sametype);CHKERRQ(ierr);
  if (!sametype) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
  if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
  if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

  /* a transposed product with a symmetric operand is carried out as a plain AB product */
  ptype = product->type;
  if (ptype == MATPRODUCT_AtB && A->symmetric) ptype = MATPRODUCT_AB;
  if (ptype == MATPRODUCT_ABt && B->symmetric) ptype = MATPRODUCT_AB;

  /* select the mult structures (plain or transposed) that realize the product */
  if (ptype == MATPRODUCT_AB) {
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
  } else if (ptype == MATPRODUCT_AtB) {
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
  } else if (ptype == MATPRODUCT_ABt) {
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
  } else {
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
  if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
  if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);

  Acsr = (CsrMatrix*)Amat->mat;
  /* B may be in compressed row storage: in that case use the CSR struct kept in the product data */
  if (mmdata->Bcsr) {
    Bcsr = mmdata->Bcsr;
  } else {
    Bcsr = (CsrMatrix*)Bmat->mat;
  }
  Ccsr = (CsrMatrix*)Cmat->mat;
  if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
  if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
  if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* B may be in compressed row storage: pick the sparse descriptor that matches Bcsr */
  if (mmdata->Bcsr) {
    BmatSpDescr = mmdata->matSpBDescr;
  } else {
    BmatSpDescr = Bmat->matDescr;
  }
  stat = cusparseSpGEMM_compute(Ccusp->handle,opN,opN,
                                Cmat->alpha_one,Amat->matDescr,BmatSpDescr,Cmat->beta_zero,Cmat->matDescr,
                                cusparse_scalartype,CUSPARSE_SPGEMM_DEFAULT,
                                mmdata->spgemmDesc,&mmdata->mmBufferSize,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle,opN,opN,
                             Cmat->alpha_one,Amat->matDescr,BmatSpDescr,Cmat->beta_zero,Cmat->matDescr,
                             cusparse_scalartype,CUSPARSE_SPGEMM_DEFAULT,mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
  stat = cusparse_csr_spgemm(Ccusp->handle,opN,opN,
                             Acsr->num_rows,Bcsr->num_cols,Acsr->num_cols,
                             Amat->descr,Acsr->num_entries,Acsr->values->data().get(),Acsr->row_offsets->data().get(),Acsr->column_indices->data().get(),
                             Bmat->descr,Bcsr->num_entries,Bcsr->values->data().get(),Bcsr->row_offsets->data().get(),Bcsr->column_indices->data().get(),
                             Cmat->descr,Ccsr->values->data().get(),Ccsr->row_offsets->data().get(),Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
  ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  C->offloadmask = PETSC_OFFLOAD_GPU; /* the values just computed live on the GPU */

finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->was_assembled    = PETSC_TRUE;
  C->assembled        = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2323fcdce8c4SStefano Zampini 
2324fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2325fcdce8c4SStefano Zampini {
2326fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2327fcdce8c4SStefano Zampini   Mat                          A,B;
2328fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2329fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a,*b,*c;
2330fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2331fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2332fcdce8c4SStefano Zampini   PetscInt                     i,j,m,n,k;
2333fcdce8c4SStefano Zampini   PetscBool                    flg;
2334fcdce8c4SStefano Zampini   PetscErrorCode               ierr;
2335fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2336fcdce8c4SStefano Zampini   cudaError_t                  cerr;
2337fcdce8c4SStefano Zampini   MatProductType               ptype;
2338fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2339fcdce8c4SStefano Zampini   PetscLogDouble               flops;
2340fcdce8c4SStefano Zampini   PetscBool                    biscompressed,ciscompressed;
2341fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2342fcdce8c4SStefano Zampini   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2343fcdce8c4SStefano Zampini   size_t                       bufSize2;
2344fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2345fcdce8c4SStefano Zampini #else
2346fcdce8c4SStefano Zampini   int                          cnz;
2347fcdce8c4SStefano Zampini #endif
2348fcdce8c4SStefano Zampini 
2349fcdce8c4SStefano Zampini   PetscFunctionBegin;
2350fcdce8c4SStefano Zampini   MatCheckProduct(C,1);
2351fcdce8c4SStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2352fcdce8c4SStefano Zampini   A    = product->A;
2353fcdce8c4SStefano Zampini   B    = product->B;
2354fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2355fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2356fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2357fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2358fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ*)A->data;
2359fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ*)B->data;
2360fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2361fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2362fcdce8c4SStefano Zampini   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2363fcdce8c4SStefano Zampini   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2364fcdce8c4SStefano Zampini 
2365fcdce8c4SStefano Zampini   /* product data */
2366fcdce8c4SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2367fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2368fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2369fcdce8c4SStefano Zampini 
2370fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2371fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2372fcdce8c4SStefano Zampini   ptype = product->type;
2373fcdce8c4SStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2374fcdce8c4SStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2375fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2376fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2377fcdce8c4SStefano Zampini   switch (ptype) {
2378fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2379fcdce8c4SStefano Zampini     m = A->rmap->n;
2380fcdce8c4SStefano Zampini     n = B->cmap->n;
2381fcdce8c4SStefano Zampini     k = A->cmap->n;
2382fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2383fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2384fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2385fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2386fcdce8c4SStefano Zampini     break;
2387fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2388fcdce8c4SStefano Zampini     m = A->cmap->n;
2389fcdce8c4SStefano Zampini     n = B->cmap->n;
2390fcdce8c4SStefano Zampini     k = A->rmap->n;
2391fcdce8c4SStefano Zampini     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
2392fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2393fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2394fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2395fcdce8c4SStefano Zampini     break;
2396fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2397fcdce8c4SStefano Zampini     m = A->rmap->n;
2398fcdce8c4SStefano Zampini     n = B->rmap->n;
2399fcdce8c4SStefano Zampini     k = A->cmap->n;
2400fcdce8c4SStefano Zampini     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
2401fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2402fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2403fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2404fcdce8c4SStefano Zampini     break;
2405fcdce8c4SStefano Zampini   default:
2406fcdce8c4SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2407fcdce8c4SStefano Zampini   }
2408fcdce8c4SStefano Zampini 
2409fcdce8c4SStefano Zampini   /* create cusparse matrix */
2410fcdce8c4SStefano Zampini   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2411fcdce8c4SStefano Zampini   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2412fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ*)C->data;
2413fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2414fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2415fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2416fcdce8c4SStefano Zampini 
2417fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2418fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2419fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
2420fcdce8c4SStefano Zampini     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2421fcdce8c4SStefano Zampini     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2422fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2423fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2424fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2425fcdce8c4SStefano Zampini   } else {
2426fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2427fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2428fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2429fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2430fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2431fcdce8c4SStefano Zampini   }
2432fcdce8c4SStefano Zampini   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2433fcdce8c4SStefano Zampini   Ccusp->mat      = Cmat;
2434fcdce8c4SStefano Zampini   Ccusp->mat->mat = Ccsr;
2435fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2436fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2437fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2438fcdce8c4SStefano Zampini   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2439fcdce8c4SStefano Zampini   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2440fcdce8c4SStefano Zampini   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2441fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2442fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2443fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2444fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2445fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2446fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2447fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2448fcdce8c4SStefano Zampini     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2449fcdce8c4SStefano Zampini     c->nz = 0;
2450fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2451fcdce8c4SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
2452fcdce8c4SStefano Zampini     goto finalizesym;
2453fcdce8c4SStefano Zampini   }
2454fcdce8c4SStefano Zampini 
2455fcdce8c4SStefano Zampini   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2456fcdce8c4SStefano Zampini   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2457fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2458fcdce8c4SStefano Zampini   if (!biscompressed) {
2459fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix*)Bmat->mat;
2460fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2461fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2462fcdce8c4SStefano Zampini #endif
2463fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2464fcdce8c4SStefano Zampini     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2465fcdce8c4SStefano Zampini     Bcsr = new CsrMatrix;
2466fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2467fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2468fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2469fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2470fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2471fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2472fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2473fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2474fcdce8c4SStefano Zampini       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2475fcdce8c4SStefano Zampini     }
2476fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2477fcdce8c4SStefano Zampini     mmdata->Bcsr = Bcsr;
2478fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2479fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
2480fcdce8c4SStefano Zampini       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2481fcdce8c4SStefano Zampini                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2482fcdce8c4SStefano Zampini                                Bcsr->values->data().get(),
2483fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2484fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2485fcdce8c4SStefano Zampini     }
2486fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2487fcdce8c4SStefano Zampini #endif
2488fcdce8c4SStefano Zampini   }
2489fcdce8c4SStefano Zampini   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2490fcdce8c4SStefano Zampini   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2491fcdce8c4SStefano Zampini   /* precompute flops count */
2492fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2493fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2494fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2495fcdce8c4SStefano Zampini       const PetscInt en = a->i[i+1];
2496fcdce8c4SStefano Zampini       for (j=st; j<en; j++) {
2497fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2498fcdce8c4SStefano Zampini         flops += 2.*(b->i[brow+1] - b->i[brow]);
2499fcdce8c4SStefano Zampini       }
2500fcdce8c4SStefano Zampini     }
2501fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2502fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2503fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i+1] - a->i[i];
2504fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i+1] - b->i[i];
2505fcdce8c4SStefano Zampini       flops += (2.*anzi)*bnzi;
2506fcdce8c4SStefano Zampini     }
2507fcdce8c4SStefano Zampini   } else { /* TODO */
2508fcdce8c4SStefano Zampini     flops = 0.;
2509fcdce8c4SStefano Zampini   }
2510fcdce8c4SStefano Zampini 
2511fcdce8c4SStefano Zampini   mmdata->flops = flops;
2512fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2513fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2514fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2515fcdce8c4SStefano Zampini   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2516fcdce8c4SStefano Zampini                            NULL, NULL, NULL,
2517fcdce8c4SStefano Zampini                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2518fcdce8c4SStefano Zampini                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2519fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2520fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
2521fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2522fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2523fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2524fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2525bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2526fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
2527fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2528fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2529fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2530fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2531fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
2532fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2533fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2534fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2535fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2536fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
2537fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2538fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2539fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2540fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
2541bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2542fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
2543fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2544fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2545fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2546fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2547fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
2548fcdce8c4SStefano Zampini   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2549fcdce8c4SStefano Zampini   c->nz = (PetscInt) C_nnz1;
255000702c57SStefano Zampini   ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
2551fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2552fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2553fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2554fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2555fcdce8c4SStefano Zampini   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2556fcdce8c4SStefano Zampini                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2557fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2558fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2559fcdce8c4SStefano Zampini                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2560fcdce8c4SStefano Zampini #else
2561fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2562fcdce8c4SStefano Zampini   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2563fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2564fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2565fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2566fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2567fcdce8c4SStefano Zampini   c->nz = cnz;
2568fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2569fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2570fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2571fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2572fcdce8c4SStefano Zampini 
2573fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2574fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2575fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2576fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2577fcdce8c4SStefano Zampini   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2578fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2579fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2580fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2581fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2582fcdce8c4SStefano Zampini #endif
2583fcdce8c4SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2584fcdce8c4SStefano Zampini   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2585fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2586fcdce8c4SStefano Zampini finalizesym:
2587fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
2588fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
2589fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
2590fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2591fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2592fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2593fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2594fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2595fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2596fcdce8c4SStefano Zampini     ii   = *Ccsr->row_offsets;
2597fcdce8c4SStefano Zampini     jj   = *Ccsr->column_indices;
2598fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2599fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2600fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2601fcdce8c4SStefano Zampini   } else {
2602fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2603fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2604fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2605fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2606fcdce8c4SStefano Zampini   }
2607fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
2608fcdce8c4SStefano Zampini     PetscInt r = 0;
2609fcdce8c4SStefano Zampini     c->i[0] = 0;
2610fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
2611fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
2612fcdce8c4SStefano Zampini       const PetscInt old = c->compressedrow.i[k];
2613fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r+1] = old;
2614fcdce8c4SStefano Zampini     }
2615fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2616fcdce8c4SStefano Zampini   }
2617fcdce8c4SStefano Zampini   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2618fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2619fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2620fcdce8c4SStefano Zampini   c->maxnz = c->nz;
2621fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
2622fcdce8c4SStefano Zampini   c->rmax = 0;
2623fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
2624fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
2625fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
2626fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
2627fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
2628fcdce8c4SStefano Zampini   }
2629fcdce8c4SStefano Zampini   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2630fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2631fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
2632fcdce8c4SStefano Zampini 
2633fcdce8c4SStefano Zampini   C->nonzerostate++;
2634fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2635fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2636fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
2637fcdce8c4SStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2638fcdce8c4SStefano Zampini   C->preallocated  = PETSC_TRUE;
2639fcdce8c4SStefano Zampini   C->assembled     = PETSC_FALSE;
2640fcdce8c4SStefano Zampini   C->was_assembled = PETSC_FALSE;
2641abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2642fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
2643fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
2644fcdce8c4SStefano Zampini   }
2645fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2646fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
2647fcdce8c4SStefano Zampini }
2648fcdce8c4SStefano Zampini 
PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/*
  MatProductSetFromOptions_SeqAIJCUSPARSE - picks the symbolic product routine for mat->product,
  where the B operand may be dense or sparse.

  Selection rules, in the order they are applied:
    1. B is neither MATSEQDENSE-based nor part of an all-CUSPARSE product: defer entirely to
       MatProductSetFromOptions_SeqAIJ().
    2. B is MATSEQDENSE-based: AB, AtB, ABt, PtAP and RARt use the SeqDENSECUDA symbolic routine,
       unless A is bound to the CPU, in which case MatProductSetFromOptions_SeqAIJ_SeqDense() is
       called instead; ABC uses MatProductSymbolic_ABC_Basic.
    3. Otherwise (B, and for ABC also C, are MATSEQAIJCUSPARSE): AB, AtB and ABt use the
       CUSPARSE-CUSPARSE symbolic routine; PtAP, RARt and ABC use MatProductSymbolic_ABC_Basic.

  B is only tested for MATSEQAIJCUSPARSE when neither A nor B is bound to the CPU, and C is only
  tested (ABC products) when C is not bound to the CPU; a skipped test counts as "not CUSPARSE".
  Product types not listed above leave mat->ops->productsymbolic untouched.
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  PetscErrorCode ierr;
  Mat_Product    *prod = mat->product;
  PetscBool      denseB = PETSC_FALSE; /* B is (derived from) MATSEQDENSE */
  PetscBool      cuspB  = PETSC_FALSE; /* B is MATSEQAIJCUSPARSE and usable on the GPU */
  PetscBool      cuspC  = PETSC_TRUE;  /* only meaningful for ABC; stays true for every other type */

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)prod->B,MATSEQDENSE,&denseB);CHKERRQ(ierr);
  if (!(prod->A->boundtocpu || prod->B->boundtocpu)) {
    ierr = PetscObjectTypeCompare((PetscObject)prod->B,MATSEQAIJCUSPARSE,&cuspB);CHKERRQ(ierr);
  }
  if (prod->type == MATPRODUCT_ABC) {
    /* the third operand must qualify as well, so start from "no" and test it */
    cuspC = PETSC_FALSE;
    if (!prod->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)prod->C,MATSEQAIJCUSPARSE,&cuspC);CHKERRQ(ierr);
    }
  }

  /* Neither a dense B nor an all-CUSPARSE product: fallback for AIJ */
  if (!denseB && !(cuspB && cuspC)) {
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* Dense B takes precedence over the sparse-sparse dispatch */
  if (denseB) {
    switch (prod->type) {
    case MATPRODUCT_ABC:
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
      break;
    case MATPRODUCT_AB:
    case MATPRODUCT_AtB:
    case MATPRODUCT_ABt:
    case MATPRODUCT_PtAP:
    case MATPRODUCT_RARt:
      if (!prod->A->boundtocpu) {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      } else {
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      }
      break;
    default:
      break;
    }
    PetscFunctionReturn(0);
  }

  /* Sparse-sparse product with CUSPARSE operands */
  switch (prod->type) {
  case MATPRODUCT_PtAP:
  case MATPRODUCT_RARt:
  case MATPRODUCT_ABC:
    mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    break;
  case MATPRODUCT_AB:
  case MATPRODUCT_AtB:
  case MATPRODUCT_ABt:
    mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    break;
  default:
    break;
  }
  PetscFunctionReturn(0);
}
2709ccdfe979SStefano Zampini 
/*
  MatMult_SeqAIJCUSPARSE - yy = A xx.

  Forwards to MatMultAddKernel_SeqAIJCUSPARSE() with no vector to add (NULL) and with both the
  transpose and the Hermitian flags switched off.
*/
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2718e6e9a74fSStefano Zampini 
/*
  MatMultAdd_SeqAIJCUSPARSE - zz = A xx + yy.

  Forwards to MatMultAddKernel_SeqAIJCUSPARSE() with yy as the vector to add and with both the
  transpose and the Hermitian flags switched off.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2727e6e9a74fSStefano Zampini 
/*
  MatMultHermitianTranspose_SeqAIJCUSPARSE - yy = A^H xx.

  Forwards to MatMultAddKernel_SeqAIJCUSPARSE() with no vector to add (NULL) and with both the
  transpose and the Hermitian flags switched on.
*/
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2736e6e9a74fSStefano Zampini 
/*
  MatMultHermitianTransposeAdd_SeqAIJCUSPARSE - zz = A^H xx + yy.

  Forwards to MatMultAddKernel_SeqAIJCUSPARSE() with yy as the vector to add and with both the
  transpose and the Hermitian flags switched on.
*/
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
27459ae82921SPaul Mullowney 
/*
  MatMultTranspose_SeqAIJCUSPARSE - yy = A^T xx.

  Forwards to MatMultAddKernel_SeqAIJCUSPARSE() with no vector to add (NULL), the transpose flag
  switched on and the Hermitian flag switched off.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2754ca45077fSPaul Mullowney 
2755afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2756e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
27579ae82921SPaul Mullowney {
27589ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2759aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
27609ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2761e6e9a74fSStefano Zampini   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2762b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
276357d48284SJunchao Zhang   cudaError_t                  cerr;
2764aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
2765e6e9a74fSStefano Zampini   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2766e6e9a74fSStefano Zampini   PetscBool                    compressed;
2767afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2768afb2bd1cSJunchao Zhang   PetscInt                     nx,ny;
2769afb2bd1cSJunchao Zhang #endif
27706e111a19SKarl Rupp 
27719ae82921SPaul Mullowney   PetscFunctionBegin;
2772e6e9a74fSStefano Zampini   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2773e6e9a74fSStefano Zampini   if (!a->nonzerorowcnt) {
2774afb2bd1cSJunchao Zhang     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2775d38a13f6SStefano Zampini     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2776e6e9a74fSStefano Zampini     PetscFunctionReturn(0);
2777e6e9a74fSStefano Zampini   }
277834d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
277934d6c7a5SJose E. Roman   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2780e6e9a74fSStefano Zampini   if (!trans) {
27819ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2782c9567895SMark     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2783e6e9a74fSStefano Zampini   } else {
2784e6e9a74fSStefano Zampini     if (herm || !cusparsestruct->transgen) {
2785e6e9a74fSStefano Zampini       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2786e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2787e6e9a74fSStefano Zampini     } else {
2788afb2bd1cSJunchao Zhang       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);}
2789e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2790e6e9a74fSStefano Zampini     }
2791e6e9a74fSStefano Zampini   }
2792e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2793e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2794213423ffSJunchao Zhang 
2795e6e9a74fSStefano Zampini   try {
2796e6e9a74fSStefano Zampini     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2797213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2798213423ffSJunchao Zhang     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2799afb2bd1cSJunchao Zhang 
280085ba7357SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2801e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2802afb2bd1cSJunchao Zhang       /* z = A x + beta y.
2803afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2804afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2805afb2bd1cSJunchao Zhang       */
2806e6e9a74fSStefano Zampini       xptr = xarray;
2807afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2808213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2809afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2810afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2811afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
2812afb2bd1cSJunchao Zhang        */
2813afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2814afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2815afb2bd1cSJunchao Zhang         nx = mat->num_cols;
2816afb2bd1cSJunchao Zhang         ny = mat->num_rows;
2817afb2bd1cSJunchao Zhang       }
2818afb2bd1cSJunchao Zhang      #endif
2819e6e9a74fSStefano Zampini     } else {
2820afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
2821afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2822afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2823afb2bd1cSJunchao Zhang        */
2824afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2825e6e9a74fSStefano Zampini       dptr = zarray;
2826e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2827afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
2828e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2829e6e9a74fSStefano Zampini         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2830e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2831e6e9a74fSStefano Zampini                          VecCUDAEqualsReverse());
2832e6e9a74fSStefano Zampini       }
2833afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2834afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2835afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2836afb2bd1cSJunchao Zhang         nx = mat->num_rows;
2837afb2bd1cSJunchao Zhang         ny = mat->num_cols;
2838afb2bd1cSJunchao Zhang       }
2839afb2bd1cSJunchao Zhang      #endif
2840e6e9a74fSStefano Zampini     }
28419ae82921SPaul Mullowney 
2842afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
2843aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2844afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2845afb2bd1cSJunchao Zhang       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2846afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2847afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2848afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2849afb2bd1cSJunchao Zhang         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2850afb2bd1cSJunchao Zhang                                 matstruct->matDescr,
2851afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2852afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecYDescr,
2853afb2bd1cSJunchao Zhang                                 cusparse_scalartype,
2854afb2bd1cSJunchao Zhang                                 cusparsestruct->spmvAlg,
2855afb2bd1cSJunchao Zhang                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2856afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2857afb2bd1cSJunchao Zhang 
2858afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2859afb2bd1cSJunchao Zhang       } else {
2860afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2861afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2862afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2863afb2bd1cSJunchao Zhang       }
2864afb2bd1cSJunchao Zhang 
2865afb2bd1cSJunchao Zhang       stat = cusparseSpMV(cusparsestruct->handle, opA,
2866afb2bd1cSJunchao Zhang                                matstruct->alpha_one,
2867afb2bd1cSJunchao Zhang                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEGenerateTransposeForMult() */
2868afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecXDescr,
2869afb2bd1cSJunchao Zhang                                beta,
2870afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecYDescr,
2871afb2bd1cSJunchao Zhang                                cusparse_scalartype,
2872afb2bd1cSJunchao Zhang                                cusparsestruct->spmvAlg,
2873afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2874afb2bd1cSJunchao Zhang      #else
28757656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2876e6e9a74fSStefano Zampini       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2877a65300a6SPaul Mullowney                                mat->num_rows, mat->num_cols,
2878afb2bd1cSJunchao Zhang                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2879aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
2880e6e9a74fSStefano Zampini                                mat->column_indices->data().get(), xptr, beta,
288157d48284SJunchao Zhang                                dptr);CHKERRCUSPARSE(stat);
2882afb2bd1cSJunchao Zhang      #endif
2883aa372e3fSPaul Mullowney     } else {
2884213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
2885afb2bd1cSJunchao Zhang        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2886afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2887afb2bd1cSJunchao Zhang        #else
2888301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2889e6e9a74fSStefano Zampini         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2890afb2bd1cSJunchao Zhang                                  matstruct->alpha_one, matstruct->descr, hybMat,
2891e6e9a74fSStefano Zampini                                  xptr, beta,
289257d48284SJunchao Zhang                                  dptr);CHKERRCUSPARSE(stat);
2893afb2bd1cSJunchao Zhang        #endif
2894a65300a6SPaul Mullowney       }
2895aa372e3fSPaul Mullowney     }
289605035670SJunchao Zhang     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2897958c4211Shannah_mairs     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2898aa372e3fSPaul Mullowney 
2899e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2900213423ffSJunchao Zhang       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2901213423ffSJunchao Zhang         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2902213423ffSJunchao Zhang           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2903e6e9a74fSStefano Zampini         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2904213423ffSJunchao Zhang           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
29057656d835SStefano Zampini         }
2906213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2907c1fb3f03SStefano Zampini         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
29087656d835SStefano Zampini       }
29097656d835SStefano Zampini 
2910213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2911213423ffSJunchao Zhang       if (compressed) {
2912213423ffSJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2913e6e9a74fSStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2914c41cb2e2SAlejandro Lamas Daviña         thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2915e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2916c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
291705035670SJunchao Zhang         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2918958c4211Shannah_mairs         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2919e6e9a74fSStefano Zampini       }
2920e6e9a74fSStefano Zampini     } else {
2921e6e9a74fSStefano Zampini       if (yy && yy != zz) {
2922e6e9a74fSStefano Zampini         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2923e6e9a74fSStefano Zampini       }
2924e6e9a74fSStefano Zampini     }
2925e6e9a74fSStefano Zampini     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2926213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2927213423ffSJunchao Zhang     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
29289ae82921SPaul Mullowney   } catch(char *ex) {
29299ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
29309ae82921SPaul Mullowney   }
2931e6e9a74fSStefano Zampini   if (yy) {
2932958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2933e6e9a74fSStefano Zampini   } else {
2934e6e9a74fSStefano Zampini     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2935e6e9a74fSStefano Zampini   }
29369ae82921SPaul Mullowney   PetscFunctionReturn(0);
29379ae82921SPaul Mullowney }
29389ae82921SPaul Mullowney 
/*
   MatMultTransposeAdd_SeqAIJCUSPARSE - transpose mult-add entry point for SEQAIJCUSPARSE.

   Forwards (A,xx,yy,zz) unchanged to MatMultAddKernel_SeqAIJCUSPARSE() with the two
   trailing flags set to (PETSC_TRUE,PETSC_FALSE).
   NOTE(review): the flags are presumably (transpose,hermitian), i.e. zz = A^T*xx + yy;
   confirm against the kernel's signature.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  perr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(perr);
  PetscFunctionReturn(0);
}
2947ca45077fSPaul Mullowney 
/*
   MatAssemblyEnd_SeqAIJCUSPARSE - finishes assembly of a SEQAIJCUSPARSE matrix.

   Runs the host-side MatAssemblyEnd_SeqAIJ() and then, for a final (non-flush) assembly of
   an unfactored matrix that is not bound to the CPU and owns a device-side split-CSR
   structure (deviceMat), marks the matrix data as living on the GPU.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;
  PetscBool      hasDeviceMat = PETSC_FALSE;

  PetscFunctionBegin;
  /* sample the device matrix pointer before the host assembly runs; factored matrices keep a different struct in spptr */
  if (A->factortype == MAT_FACTOR_NONE && ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat) hasDeviceMat = PETSC_TRUE;

  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?

  /* flush assemblies and CPU-bound matrices leave the offload mask untouched */
  if (hasDeviceMat && mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu) A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
29649ae82921SPaul Mullowney 
29659ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;
  Mat            mat;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  mat  = *A;
  /* sequential matrix: local and global sizes coincide */
  ierr = MatSetSizes(mat,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(mat,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* the preallocation routine takes a non-const array, hence the cast */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(mat,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
30249ae82921SPaul Mullowney 
/*
   MatDestroy_SeqAIJCUSPARSE - destroys a SEQAIJCUSPARSE matrix.

   Steps, in order:
     1. destroy the GPU-side container stored in A->spptr (the plain cusparse struct for
        unfactored matrices, the triangular-factor struct otherwise);
     2. release the device-resident split-CSR structure (deviceMat), if one was attached;
     3. remove the composed functions listed in the table below;
     4. hand over to MatDestroy_SeqAIJ() for the host-side data.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* composed functions to remove, in the order they are cleared */
  static const char *const composedNames[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C"
  };
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *deviceMat = NULL;
  size_t                     k;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    /* detach the device matrix first: it is freed separately below, after the container is gone */
    deviceMat       = cusp->deviceMat;
    cusp->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }

  if (deviceMat) {
    Mat_SeqAIJ                 *aij = (Mat_SeqAIJ*)A->data;
    PetscSplitCSRDataStructure hostCopy;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* the struct lives in device memory: copy it to the host to reach the device pointers it holds */
    cerr = cudaMemcpy(&hostCopy,deviceMat,sizeof(PetscSplitCSRDataStructure),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    /* diag.i is only freed here when compressed rows are in use
       NOTE(review): presumably it is a separately allocated array only in that case - confirm where deviceMat is built */
    if (a_compressed_guard(aij)) {
      cerr = cudaFree(hostCopy.diag.i);CHKERRCUDA(cerr);
    }
    cerr = cudaFree(deviceMat);CHKERRCUDA(cerr);
  }

  for (k = 0; k < sizeof(composedNames)/sizeof(composedNames[0]); k++) {
    ierr = PetscObjectComposeFunction((PetscObject)A,composedNames[k],NULL);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
30609ae82921SPaul Mullowney 
3061ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
306295639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/*
   MatDuplicate_SeqAIJCUSPARSE - duplicates a SEQAIJCUSPARSE matrix.

   The copy is first produced by the host implementation MatDuplicate_SeqAIJ() and is then
   converted in place to MATSEQAIJCUSPARSE, so that *B carries the GPU type as well.
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode perr;

  PetscFunctionBegin;
  /* step 1: host-side duplicate */
  perr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(perr);
  /* step 2: in-place conversion of the duplicate */
  perr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(perr);
  PetscFunctionReturn(0);
}
30729ff858a8SKarl Rupp 
/*
   MatAXPY_SeqAIJCUSPARSE - computes Y = a*X + Y for SEQAIJCUSPARSE matrices.

   Input Parameters:
+  Y   - matrix that is updated
.  a   - scalar multiplier of X
.  X   - matrix that is added
-  str - relation between the nonzero patterns of X and Y

   Dispatch:
     - X and Y do not share the same axpy implementation: the transpose of Y is invalidated
       and the host routine MatAXPY_SeqAIJ() is used;
     - SAME_NONZERO_PATTERN (given, or detected by comparing the CSR index arrays on the
       device): a single cuBLAS axpy over the value arrays;
     - SUBSET_NONZERO_PATTERN: cuSPARSE csrgeam, writing the result in place into Y's arrays;
     - anything else: the transpose of Y is invalidated and MatAXPY_SeqAIJ() is used.

   Only the MAT_CUSPARSE_CSR storage format is supported on the GPU paths.
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  Mat_SeqAIJCUSPARSE *cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
  CsrMatrix          *csry,*csrx;
  PetscScalar        *ay;
  const PetscScalar  *ax;

  PetscFunctionBegin;
  if (X->ops->axpy != Y->ops->axpy) {
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  csry = (CsrMatrix*)cy->mat->mat;
  csrx = (CsrMatrix*)cx->mat->mat;

  /* see if we can turn this into a cublas axpy: equal counts and identical row offsets and column indices */
  if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
    if (thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin()) &&
        thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin())) {
      str = SAME_NONZERO_PATTERN;
    }
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SAME_NONZERO_PATTERN) {
    /* identical patterns: the value arrays line up entry by entry, so ay += a*ax */
    cublasHandle_t cublasv2handle;
    cublasStatus_t berr;
    PetscBLASInt   one = 1, bnz = 1;

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
    ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SUBSET_NONZERO_PATTERN) {
    /* pattern of X contained in that of Y: Y = a*X + b*Y with b = 1, output stored in Y's own CSR arrays */
    cusparseStatus_t stat;
    PetscScalar      b = 1.0;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
    /* the scalars a and b live on the host for this call */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11 needs an explicit work buffer, allocated outside the timed region */
    stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
                                          &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                                          &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                             cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
#endif
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
#else
    stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
                               &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
                               &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
                                  cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#endif
    /* switch the handle back to device pointer mode */
    stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* general case: handled by the host implementation, so the cached transpose of Y is dropped first */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
317395639643SRichard Tran Mills 
/*
   MatScale_SeqAIJCUSPARSE - scales all stored entries of Y by the scalar a on the GPU.

   The value array obtained from MatSeqAIJCUSPARSEGetArray() is scaled with one cuBLAS scal
   call over y->nz entries; afterwards the cached diagonal information of Y is invalidated.
*/
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *vals;
  PetscBLASInt   stride = 1, nvals = 1;
  cublasHandle_t handle;
  cublasStatus_t bstat;
  cudaError_t    cerr;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSEGetArray(Y,&vals);CHKERRQ(ierr);
  ierr = PetscCUBLASGetHandle(&handle);CHKERRQ(ierr);
  /* errors out if the nonzero count does not fit in a BLAS integer */
  ierr = PetscBLASIntCast(aij->nz,&nvals);CHKERRQ(ierr);

  ierr  = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  bstat = cublasXscal(handle,nvals,&a,vals,stride);CHKERRCUBLAS(bstat);
  cerr  = WaitForCUDA();CHKERRCUDA(cerr);
  ierr  = PetscLogGpuFlops(nvals);CHKERRQ(ierr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  ierr = MatSeqAIJCUSPARSERestoreArray(Y,&vals);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
319733c9ba73SStefano Zampini 
/*
   MatZeroEntries_SeqAIJCUSPARSE - zeros the stored values of A on both the host and the GPU.

   For unfactored matrices the device value arrays of the matrix and of its cached transpose
   (when they exist) are filled with zeros. The host value array is always zeroed directly
   (MatZeroEntries_SeqAIJ() is intentionally not called) and the cached diagonal is invalidated.

   On return the offload mask is PETSC_OFFLOAD_BOTH if the device values of the matrix itself
   were zeroed, PETSC_OFFLOAD_CPU otherwise.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;
  PetscBool      gpuZeroed = PETSC_FALSE;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;

      if (csr->values) {
        gpuZeroed = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    /* the transpose does not influence the offload mask, it is only kept consistent */
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)cusp->matTranspose->mat;

      if (csrT->values) {
        thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
      }
    }
  }

  /* host values: a->i[nrows] is the total number of stored entries */
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = gpuZeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
32293fa6b06aSMark Adams 
/*
   MatBindToCPU_SeqAIJCUSPARSE - switches a SEQAIJCUSPARSE matrix between its CPU (SeqAIJ)
   and GPU (CUSPARSE) implementations.

   Input Parameters:
+  A   - the matrix; factored matrices are left untouched
-  flg - PETSC_TRUE to bind to the CPU, PETSC_FALSE to use the GPU

   When binding to the CPU the values are first copied back from the GPU, the operation
   table is pointed at the SeqAIJ routines and the GPU-only composed functions are removed.
   When unbinding, the CUSPARSE routines and composed functions are installed again.

   Inode kernels are a CPU-only optimization: they are switched off on the GPU, and on the
   CPU they are enabled only if inode data is actually available (a->inode.size set).
   Unconditionally setting a->inode.use to PETSC_TRUE would route the host kernels through
   the inode code path for matrices whose inode information was never built (for instance
   when inodes were disabled by the user).
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
  if (flg) {
    /* make sure the host copy is up to date before the host kernels take over */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  } else {
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  }
  A->boundtocpu = flg;
  /* inodes: never on the GPU; on the CPU only when the inode sizes have been computed */
  a->inode.use = (flg && a->inode.size) ? PETSC_TRUE : PETSC_FALSE;
  PetscFunctionReturn(0);
}
3280a587d139SMark 
/*
   MatConvert_SeqAIJ_SeqAIJCUSPARSE - Turns a SeqAIJ matrix into a SeqAIJCUSPARSE one.

   Input Parameters:
+  A     - the matrix to convert
.  mtype - the requested type (not read by this routine)
-  reuse - MAT_INITIAL_MATRIX duplicates A into newmat, MAT_REUSE_MATRIX copies A into the
           existing *newmat; for any other value *newmat is used as passed in

   Output Parameter:
.  newmat - the matrix that receives the CUSPARSE data structure and operations

   Notes:
   The default vector type of the result is set to VECCUDA. When the result has no spptr yet
   (and reuse is not MAT_REUSE_MATRIX) a Mat_SeqAIJCUSPARSE or, for factored matrices, a
   Mat_SeqAIJCUSPARSETriFactors structure is allocated together with a cusparse handle.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat              C;
  cusparseStatus_t cstat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  /* this conversion may be the very first use of CUSPARSE, so make sure CUDA is initialized */
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
    break;
  case MAT_REUSE_MATRIX:
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
    break;
  default: /* remaining modes operate directly on *newmat */
    break;
  }
  C = *newmat;

  /* vectors created from this matrix default to the CUDA vector type */
  ierr = PetscFree(C->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&C->defaultvectype);CHKERRQ(ierr);

  if (!C->spptr && reuse != MAT_REUSE_MATRIX) {
    if (C->factortype != MAT_FACTOR_NONE) {
      /* factored matrix: container for the triangular factors */
      Mat_SeqAIJCUSPARSETriFactors *factors;

      ierr  = PetscNew(&factors);CHKERRQ(ierr);
      cstat = cusparseCreate(&factors->handle);CHKERRCUSPARSE(cstat);
      C->spptr = factors;
    } else {
      /* regular matrix: container for the mult structures, CSR storage by default */
      Mat_SeqAIJCUSPARSE *cusp;

      ierr  = PetscNew(&cusp);CHKERRQ(ierr);
      cusp->format = MAT_CUSPARSE_CSR;
      cstat = cusparseCreate(&cusp->handle);CHKERRCUSPARSE(cstat);
      cusp->deviceMat = NULL;
      C->spptr = cusp;
    }
    C->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  /* install the type-specific operations */
  C->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
  C->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  C->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  C->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  C->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;

  ierr = MatBindToCPU_SeqAIJCUSPARSE(C,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)C,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
33289ae82921SPaul Mullowney 
/*
   MatCreate_SeqAIJCUSPARSE - Constructor for the SeqAIJCUSPARSE matrix type.

   The matrix is first built as a plain SeqAIJ matrix, then converted in place
   (MAT_INPLACE_MATRIX) to SeqAIJCUSPARSE, and finally the CUSPARSE-specific
   options are processed through MatSetFromOptions_SeqAIJCUSPARSE().
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* host-side AIJ setup */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  /* attach the CUSPARSE structure and operations to the same object */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);

  /* read the CUSPARSE options; PetscOptionsObject is provided by PetscObjectOptionsBegin() */
  ierr = PetscObjectOptionsBegin((PetscObject)B);CHKERRQ(ierr);
  ierr = MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionsObject,B);CHKERRQ(ierr);
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
334102fe1965SBarry Smith 
/*MC
   MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   Options Database Keys:
+  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
33587f756511SDominic Meiser 
/* Factory that builds the CUSPARSE factor matrices; defined elsewhere */
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);


/*
   MatSolverTypeRegister_CUSPARSE - Registers MatGetFactor_seqaijcusparse_cusparse() as the
   MATSOLVERCUSPARSE factory for MATSEQAIJCUSPARSE matrices, for the LU, Cholesky, ILU and
   ICC factorization types (in that order).
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  static const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  const int                  nftypes  = (int)(sizeof(ftypes)/sizeof(ftypes[0]));
  PetscErrorCode             ierr;

  PetscFunctionBegin;
  for (int k = 0; k < nftypes; k++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[k],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
337329b38603SBarry Smith 
/*
   MatSeqAIJCUSPARSE_Destroy - Releases everything owned by a Mat_SeqAIJCUSPARSE structure:
   the mult structures for the matrix and its transpose, the auxiliary device arrays, the
   cusparse handle (if any) and finally the structure itself. Does nothing when
   *cusparsestruct is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;
  cusparseStatus_t   cstat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);

  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);

  /* auxiliary arrays; deleting a NULL pointer is a no-op */
  delete cusp->workVector;
  delete cusp->rowoffsets_gpu;
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  delete cusp->csr2csc_i;

  if (cusp->handle) {
    cstat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(cstat);
  }
  /* free through the caller's pointer, as the original code does */
  ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
33937f756511SDominic Meiser 
/*
   CsrMatrix_Destroy - Deletes the values, column_indices and row_offsets arrays of a
   CsrMatrix, then the CsrMatrix itself, and resets *mat to NULL. Does nothing when
   *mat is already NULL.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) PetscFunctionReturn(0);
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = NULL;
  PetscFunctionReturn(0);
}
34067f756511SDominic Meiser 
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy (triangular factor overload) - Releases the cusparse
   matrix descriptor, the solve analysis info, the CSR storage, the solve buffer, the
   AA_h host buffer (released with cudaFreeHost) and, with CUDA >= 11, the csr2csc buffer
   of a Mat_SeqAIJCUSPARSETriFactorStruct, then frees the structure itself.
   Does nothing when *trifactor is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
  cusparseStatus_t                  stat;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!tf) PetscFunctionReturn(0);

  if (tf->descr) {
    stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSPARSE(stat);
  }
  if (tf->solveInfo) {
    stat = cusparse_destroy_analysis_info(tf->solveInfo);CHKERRCUSPARSE(stat);
  }
  ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
  if (tf->solveBuffer) {
    cudaError_t cerr = cudaFree(tf->solveBuffer);CHKERRCUDA(cerr);
  }
  if (tf->AA_h) {
    cudaError_t cerr = cudaFreeHost(tf->AA_h);CHKERRCUDA(cerr);
  }
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (tf->csr2cscBuffer) {
    cudaError_t cerr = cudaFree(tf->csr2cscBuffer);CHKERRCUDA(cerr);
  }
 #endif
  /* free through the caller's pointer, as the original code does */
  ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
34267f756511SDominic Meiser 
/*
   MatSeqAIJCUSPARSEMultStruct_Destroy (mult structure overload) - Releases a
   Mat_SeqAIJCUSPARSEMultStruct and sets *matstruct to NULL.

   Input Parameters:
+  matstruct - address of the structure pointer; nothing is done when *matstruct is NULL
-  format    - storage format of (*matstruct)->mat; selects how the stored matrix is released

   Notes:
   With CUDA >= 11 the ELL and HYB formats are rejected with PETSC_ERR_SUP; in that case the
   routine returns before anything has been released.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat = (CsrMatrix*)(*matstruct)->mat;
        /* propagate failures instead of silently dropping the error code */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    /* device-resident scalar constants */
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* release the buffer and dense-vector descriptors of every initialized SpMV slot */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
34707f756511SDominic Meiser 
/*
   MatSeqAIJCUSPARSETriFactors_Reset - Destroys the four triangular factor structures
   (lower/upper and their transposes) and the permutation and work arrays held by a
   Mat_SeqAIJCUSPARSETriFactors, resetting the array pointers to NULL. The container
   itself is not freed. Does nothing when *trifactors is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!fs) PetscFunctionReturn(0);

  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose);CHKERRQ(ierr);

  /* drop the arrays and clear the pointers so the container can be reused */
  delete fs->rpermIndices;
  fs->rpermIndices = NULL;
  delete fs->cpermIndices;
  fs->cpermIndices = NULL;
  delete fs->workVector;
  fs->workVector = NULL;
  PetscFunctionReturn(0);
}
3490ccdfe979SStefano Zampini 
/*
   MatSeqAIJCUSPARSETriFactors_Destroy - Resets a Mat_SeqAIJCUSPARSETriFactors container
   (see MatSeqAIJCUSPARSETriFactors_Reset()), destroys its cusparse handle if one was
   created, and frees the container. Does nothing when *trifactors is NULL.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(0);

  ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
  handle = (*trifactors)->handle;
  if (handle) {
    stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
  }
  ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
35077e8381f9SStefano Zampini 
/* Strict lexicographic "less than" on (first, second) index pairs: the first components
   are compared, and the second components break ties */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt i1 = thrust::get<0>(t1), i2 = thrust::get<0>(t2);

    if (i1 != i2) return i1 < i2;
    return thrust::get<1>(t1) < thrust::get<1>(t2);
  }
};
35187e8381f9SStefano Zampini 
/* Equality of two index pairs: true only when both components match */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return thrust::get<0>(t1) == thrust::get<0>(t2) && thrust::get<1>(t1) == thrust::get<1>(t2);
  }
};
35287e8381f9SStefano Zampini 
/* Change indicator: yields 0 when the two indices are equal and 1 otherwise */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    if (t1 == t2) return 0;
    return 1;
  }
};
35377e8381f9SStefano Zampini 
/* Logical OR of two flags: yields 1 when at least one argument is nonzero, 0 otherwise */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 || t2) ? 1 : 0;
  }
};
35467e8381f9SStefano Zampini 
35477e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
/*
   MatSetValuesCOO_SeqAIJCUSPARSE - Sets the values of a matrix whose nonzero pattern was
   given in COO form, writing directly into the CSR values stored on the GPU.

   Input Parameters:
+  A     - the matrix
.  v     - values in the original COO ordering (host or device memory, as reported by
           isCudaMem()); when NULL, INSERT_VALUES zeroes the stored values and any other
           mode leaves them unchanged
-  imode - ADD_VALUES adds to the stored values, otherwise the stored values are overwritten

   Notes:
   cooPerm is used to read v in permuted order. When cooPerm_a is present, entries sharing
   the same key in cooPerm_a are summed with reduce_by_key before being stored/added.
   If no COO permutation is stored, the routine only runs the regular assembly.
   On exit the matrix is marked as assembled with up-to-date data on the GPU.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *aij  = (Mat_SeqAIJ*)A->data;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    /* no COO permutation stored: just assemble */
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");

  if (v) {
    THRUSTARRAY                           *staged = NULL; /* device copy of v, only for host input */
    thrust::device_ptr<const PetscScalar> d_v;
    cudaError_t                           cerr;
    PetscInt                              n = cusp->cooPerm->size();

    if (!isCudaMem(v)) {
      staged = new THRUSTARRAY(n);
      staged->assign(v,v+n);
      d_v = staged->data();
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      d_v = thrust::device_pointer_cast(v);
    }

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (cusp->cooPerm_a) {
      /* repeated entries in COO: reduce the permuted values by key */
      auto vperm = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());

      if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
        THRUSTARRAY *sums = new THRUSTARRAY(csr->values->size());

        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vperm,thrust::make_discard_iterator(),sums->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
        thrust::transform(sums->begin(),sums->end(),csr->values->begin(),csr->values->begin(),thrust::plus<PetscScalar>());
        delete sums;
      } else {
        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vperm,thrust::make_discard_iterator(),csr->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      }
    } else {
      /* pair each permuted input value with its destination entry */
      auto first = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                csr->values->begin()));
      auto last  = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                csr->values->end()));

      if (imode == ADD_VALUES) {
        thrust::for_each(first,last,VecCUDAPlusEquals());
      } else {
        thrust::for_each(first,last,VecCUDAEquals());
      }
    }
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete staged;
  } else if (imode == INSERT_VALUES) {
    thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
  }

  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,aij->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",aij->rmax);CHKERRQ(ierr);
  aij->reallocs       = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->was_assembled    = PETSC_TRUE;
  A->assembled        = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
36267e8381f9SStefano Zampini 
/*
   MatSeqAIJCUSPARSEInvalidateTranspose - Marks the cached transpose of a SeqAIJCUSPARSE
   matrix as out of date.

   Input Parameters:
+  A       - the matrix (must be of type MATSEQAIJCUSPARSE)
-  destroy - when true, the transpose mult structure and the csr2csc_i array are also released

   Notes:
   Nothing is done when the matrix has no CUSPARSE structure attached.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  if (cusp) {
    if (destroy) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
      delete cusp->csr2csc_i;
      cusp->csr2csc_i = NULL;
    }
    cusp->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3643*a49f1ed0SStefano Zampini 
36447e8381f9SStefano Zampini #include <thrust/binary_search.h>
/*
   MatSetPreallocationCOO_SeqAIJCUSPARSE - Builds the CSR nonzero structure of a
   SeqAIJCUSPARSE matrix from a list of COO (row, column) indices.

   Input Parameters:
+  A     - the matrix
.  n     - number of COO entries (must be nonnegative)
.  coo_i - row indices, read as a host array of length n
-  coo_j - column indices, read as a host array of length n

   Notes:
   The index pairs are copied to the GPU and sorted by (row, column); the sorting permutation
   is kept in cooPerm. Duplicate pairs are removed; when duplicates exist, cooPerm_a stores,
   for every sorted entry, the running count of (row, column) changes seen so far, i.e. a key
   identifying its group of repeated entries (used as the reduce_by_key key in
   MatSetValuesCOO_SeqAIJCUSPARSE()); otherwise cooPerm_a is NULL.
   The host CSR arrays of the Mat_SeqAIJ are rebuilt from the unique entries, the values are
   zeroed, the matrix is copied to the GPU and any cached transpose is destroyed.
   On exit the matrix is flagged as not assembled.
*/
PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  PetscInt           cooPerm_n, nzr = 0;
  cudaError_t        cerr;

  PetscFunctionBegin;
  /* same sanity check as in MatSetValuesCOO_SeqAIJCUSPARSE: cusp is dereferenced below */
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (n < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Number of COO entries %D cannot be negative",n);
  ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
  /* the stored permutation arrays can only be reused when the number of entries did not change */
  cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
  if (n != cooPerm_n) {
    delete cusp->cooPerm;
    delete cusp->cooPerm_a;
    cusp->cooPerm = NULL;
    cusp->cooPerm_a = NULL;
  }
  if (n) {
    THRUSTINTARRAY d_i(n);
    THRUSTINTARRAY d_j(n);
    THRUSTINTARRAY ii(A->rmap->n);

    if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
    if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }

    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
    d_i.assign(coo_i,coo_i+n);
    d_j.assign(coo_j,coo_j+n);
    auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
    auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    /* cooPerm starts as 0..n-1 and is permuted together with the keys while sorting by (i,j) */
    thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
    thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
    /* keep copies of the sorted indices: thrust::unique below modifies d_i and d_j in place */
    *cusp->cooPerm_a = d_i;
    THRUSTINTARRAY w = d_j;

    auto nekey = thrust::unique(fkey, ekey, IJEqual());
    if (nekey == ekey) { /* all entries are unique */
      delete cusp->cooPerm_a;
      cusp->cooPerm_a = NULL;
    } else { /* I couldn't come up with a more elegant algorithm */
      /* flag the positions where the row (cooPerm_a) or the column (w) differs from the previous entry */
      thrust::adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
      thrust::adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
      /* adjacent_difference copies the first element unchanged: it must not count as a change */
      (*cusp->cooPerm_a)[0] = 0;
      w[0] = 0;
      /* an entry starts a new (i,j) group if either index changed; the scan turns the flags into group keys */
      thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
      thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
    }
    /* ii[r] = number of unique entries with row index <= r, i.e. the CSR row offset of row r+1 */
    thrust::counting_iterator<PetscInt> search_begin(0);
    thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
                        search_begin, search_begin + A->rmap->n,
                        ii.begin());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

    /* rebuild the host CSR arrays (separately allocated) from the device results */
    ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
    a->singlemalloc = PETSC_FALSE;
    a->free_a       = PETSC_TRUE;
    a->free_ij      = PETSC_TRUE;
    ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
    a->i[0] = 0;
    cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    a->nz = a->maxnz = a->i[A->rmap->n];
    a->rmax = 0;
    ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
    ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
    /* after thrust::unique the first a->nz entries of d_j are the column indices of the unique pairs */
    cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
    if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
    /* per-row lengths, number of nonempty rows and maximum row length */
    for (PetscInt i = 0; i < A->rmap->n; i++) {
      const PetscInt nnzr = a->i[i+1] - a->i[i];
      nzr += (PetscInt)!!(nnzr);
      a->ilen[i] = a->imax[i] = nnzr;
      a->rmax = PetscMax(a->rmax,nnzr);
    }
    a->nonzerorowcnt = nzr;
    A->preallocated = PETSC_TRUE;
    ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
    ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
  }
  ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);

  /* We want to allocate the CUSPARSE struct for matvec now.
     The code is so convoluted now that I prefer to copy zeros */
  ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
  ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  A->nonzerostate++;
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  /* the nonzero pattern changed: drop the cached transpose */
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);

  A->assembled = PETSC_FALSE;
  A->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
3744ed502f03SStefano Zampini 
/*
   MatSeqAIJCUSPARSEGetArrayRead - Gives read-only access to the device array holding the
   CSR values of a SeqAIJCUSPARSE matrix.

   Input Parameter:
.  A - the matrix (must be of type MATSEQAIJCUSPARSE)

   Output Parameter:
.  a - pointer to the values stored on the GPU

   Notes:
   The matrix is copied to the GPU first (MatSeqAIJCUSPARSECopyToGPU()).
   The ELL and HYB storage formats are rejected with PETSC_ERR_SUP.
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  PetscErrorCode     ierr;
  CsrMatrix          *matrix;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  switch (cusp->format) {
  case MAT_CUSPARSE_ELL:
  case MAT_CUSPARSE_HYB:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  default:
    break;
  }
  /* make sure the device copy exists and is current before handing out the pointer */
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = matrix->values->data().get();
  PetscFunctionReturn(0);
}
3763ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArrayRead - Ends the read-only access started with
  MatSeqAIJCUSPARSEGetArrayRead().

  Input Parameters:
+ A - the matrix; must be of type MATSEQAIJCUSPARSE
- a - address of the pointer previously obtained; reset to NULL on return

  Notes:
  Only the caller's pointer is cleared; no field of A is modified here.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* invalidate the caller's handle so it cannot be reused by mistake */
  *a = NULL;
  PetscFunctionReturn(0);
}
3773ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSEGetArray - Returns a read-write pointer to the values of the
  CSR matrix stored in the Mat_SeqAIJCUSPARSE struct of A.

  Input Parameter:
. A - the matrix; must be of type MATSEQAIJCUSPARSE

  Output Parameter:
. a - set to csr->values->data().get()

  Notes:
  Errors with PETSC_ERR_SUP when the storage format is MAT_CUSPARSE_ELL or MAT_CUSPARSE_HYB.
  MatSeqAIJCUSPARSECopyToGPU() is called first, so the returned array holds the current values.
  A->offloadmask is set to PETSC_OFFLOAD_GPU and MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)
  is called, since the caller may change the values.
  Pair with MatSeqAIJCUSPARSERestoreArray().
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read A->spptr only after A has been validated */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  /* guard against a mult struct without an attached CsrMatrix before dereferencing it */
  if (!csr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CsrMatrix");
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3794039c6fbaSStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArray - Ends the read-write access started with
  MatSeqAIJCUSPARSEGetArray().

  Input Parameters:
+ A - the matrix; must be of type MATSEQAIJCUSPARSE
- a - address of the pointer previously obtained; reset to NULL on return

  Notes:
  The object state of A is increased because the values may have been modified.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* record that the matrix may have changed */
  ierr = PetscObjectStateIncrease((PetscObject)A);
  CHKERRQ(ierr);
  /* invalidate the caller's handle so it cannot be reused by mistake */
  *a = NULL;
  PetscFunctionReturn(0);
}
3807039c6fbaSStefano Zampini 
/*
  MatSeqAIJCUSPARSEGetArrayWrite - Returns a write-only pointer to the values of the
  CSR matrix stored in the Mat_SeqAIJCUSPARSE struct of A.

  Input Parameter:
. A - the matrix; must be of type MATSEQAIJCUSPARSE

  Output Parameter:
. a - set to csr->values->data().get()

  Notes:
  Errors with PETSC_ERR_SUP when the storage format is MAT_CUSPARSE_ELL or MAT_CUSPARSE_HYB.
  Unlike MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSECopyToGPU() is NOT called here, so the
  Mat_SeqAIJCUSPARSEMultStruct must already exist (PETSC_ERR_COR otherwise) and the current
  contents of the returned array should not be relied upon.
  A->offloadmask is set to PETSC_OFFLOAD_GPU and MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE)
  is called.
  Pair with MatSeqAIJCUSPARSERestoreArrayWrite().
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* read A->spptr only after A has been validated */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  /* guard against a mult struct without an attached CsrMatrix before dereferencing it */
  if (!csr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CsrMatrix");
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3827ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArrayWrite - Ends the write-only access started with
  MatSeqAIJCUSPARSEGetArrayWrite().

  Input Parameters:
+ A - the matrix; must be of type MATSEQAIJCUSPARSE
- a - address of the pointer previously obtained; reset to NULL on return

  Notes:
  The object state of A is increased because the values have presumably been overwritten.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* record that the matrix may have changed */
  ierr = PetscObjectStateIncrease((PetscObject)A);
  CHKERRQ(ierr);
  /* invalidate the caller's handle so it cannot be reused by mistake */
  *a = NULL;
  PetscFunctionReturn(0);
}
3840ed502f03SStefano Zampini 
/*
  IJCompare4 - "less than" comparator for 4-tuples that orders lexicographically by
  tuple element 0, then by tuple element 1; elements 2 and 3 do not take part in the
  comparison.

  operator() is const-qualified: it does not modify the functor, and this lets it be
  called through a const object or reference.
*/
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) const
  {
    /* primary key: element 0 */
    if (t1.get<0>() < t2.get<0>()) return true;
    /* tie on element 0: fall back to element 1 */
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};
3851ed502f03SStefano Zampini 
/*
  Shift - unary functor that adds a fixed integer offset to its argument.

  operator() is const-qualified: it only reads _shift, and this lets it be called
  through a const object or reference.
*/
struct Shift
{
  int _shift; /* offset added to every input */

  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c) const
  {
    return c + _shift;
  }
};
3863ed502f03SStefano Zampini 
/* merges two SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */
3865ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3866ed502f03SStefano Zampini {
3867ed502f03SStefano Zampini   PetscErrorCode               ierr;
3868ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3869ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3870ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3871ed502f03SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3872ed502f03SStefano Zampini   PetscInt                     Annz,Bnnz;
3873ed502f03SStefano Zampini   cusparseStatus_t             stat;
3874ed502f03SStefano Zampini   PetscInt                     i,m,n,zero = 0;
3875ed502f03SStefano Zampini   cudaError_t                  cerr;
3876ed502f03SStefano Zampini 
3877ed502f03SStefano Zampini   PetscFunctionBegin;
3878ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3879ed502f03SStefano Zampini   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
3880ed502f03SStefano Zampini   PetscValidPointer(C,4);
3881ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3882ed502f03SStefano Zampini   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
3883ed502f03SStefano Zampini   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
3884ed502f03SStefano Zampini   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3885ed502f03SStefano Zampini   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3886ed502f03SStefano Zampini   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3887ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
3888ed502f03SStefano Zampini     m     = A->rmap->n;
3889ed502f03SStefano Zampini     n     = A->cmap->n + B->cmap->n;
3890ed502f03SStefano Zampini     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
3891ed502f03SStefano Zampini     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
3892ed502f03SStefano Zampini     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3893ed502f03SStefano Zampini     c     = (Mat_SeqAIJ*)(*C)->data;
3894ed502f03SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3895ed502f03SStefano Zampini     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3896ed502f03SStefano Zampini     Ccsr  = new CsrMatrix;
3897ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
3898ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
3899ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
3900ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
3901ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
3902ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
3903ed502f03SStefano Zampini     Ccusp->nrows    = m;
3904ed502f03SStefano Zampini     Ccusp->mat      = Cmat;
3905ed502f03SStefano Zampini     Ccusp->mat->mat = Ccsr;
3906ed502f03SStefano Zampini     Ccsr->num_rows  = m;
3907ed502f03SStefano Zampini     Ccsr->num_cols  = n;
3908ed502f03SStefano Zampini     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
3909ed502f03SStefano Zampini     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3910ed502f03SStefano Zampini     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
3911ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
3912ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
3913ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
3914ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3915ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3916ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3917ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3918ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
3919ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(A);CHKERRQ(ierr);
3920ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSEGenerateTransposeForMult(B);CHKERRQ(ierr);
3921ed502f03SStefano Zampini     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3922ed502f03SStefano Zampini     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3923ed502f03SStefano Zampini 
3924ed502f03SStefano Zampini     Acsr = (CsrMatrix*)Acusp->mat->mat;
3925ed502f03SStefano Zampini     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
3926ed502f03SStefano Zampini     Annz = (PetscInt)Acsr->column_indices->size();
3927ed502f03SStefano Zampini     Bnnz = (PetscInt)Bcsr->column_indices->size();
3928ed502f03SStefano Zampini     c->nz = Annz + Bnnz;
3929ed502f03SStefano Zampini     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
3930ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3931ed502f03SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
3932ed502f03SStefano Zampini     Ccsr->num_entries = c->nz;
3933ed502f03SStefano Zampini     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
3934ed502f03SStefano Zampini     if (c->nz) {
39352ed87e7eSStefano Zampini       auto Acoo = new THRUSTINTARRAY32(Annz);
39362ed87e7eSStefano Zampini       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
39372ed87e7eSStefano Zampini       auto Ccoo = new THRUSTINTARRAY32(c->nz);
39382ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff,*Broff;
39392ed87e7eSStefano Zampini 
3940ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
3941ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
3942ed502f03SStefano Zampini           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
3943ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
3944ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3945ed502f03SStefano Zampini         }
39462ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
39472ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
3948ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
3949ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
3950ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
3951ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
3952ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3953ed502f03SStefano Zampini         }
39542ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
39552ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
3956ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
39572ed87e7eSStefano Zampini       stat = cusparseXcsr2coo(Acusp->handle,
39582ed87e7eSStefano Zampini                               Aroff->data().get(),
39592ed87e7eSStefano Zampini                               Annz,
39602ed87e7eSStefano Zampini                               m,
39612ed87e7eSStefano Zampini                               Acoo->data().get(),
39622ed87e7eSStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3963ed502f03SStefano Zampini       stat = cusparseXcsr2coo(Bcusp->handle,
39642ed87e7eSStefano Zampini                               Broff->data().get(),
3965ed502f03SStefano Zampini                               Bnnz,
3966ed502f03SStefano Zampini                               m,
39672ed87e7eSStefano Zampini                               Bcoo->data().get(),
3968ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
39692ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
39702ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
39712ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
39728909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
3973ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
3974ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
39758909a122SStefano Zampini #else
39768909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
39778909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
39788909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
39798909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
39808909a122SStefano Zampini #endif
39812ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
39822ed87e7eSStefano Zampini       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
39832ed87e7eSStefano Zampini       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
39842ed87e7eSStefano Zampini       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
39852ed87e7eSStefano Zampini       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
39862ed87e7eSStefano Zampini       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
3987ed502f03SStefano Zampini       auto p1 = Ccusp->cooPerm->begin();
3988ed502f03SStefano Zampini       auto p2 = Ccusp->cooPerm->begin();
3989ed502f03SStefano Zampini       thrust::advance(p2,Annz);
39902ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
39918909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
39928909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
39938909a122SStefano Zampini #endif
39942ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
39952ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
39962ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
39972ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
39982ed87e7eSStefano Zampini #else
39992ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
40002ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
40012ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
40022ed87e7eSStefano Zampini #endif
4003ed502f03SStefano Zampini       stat = cusparseXcoo2csr(Ccusp->handle,
40042ed87e7eSStefano Zampini                               Ccoo->data().get(),
4005ed502f03SStefano Zampini                               c->nz,
4006ed502f03SStefano Zampini                               m,
4007ed502f03SStefano Zampini                               Ccsr->row_offsets->data().get(),
4008ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4009ed502f03SStefano Zampini       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4010ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
40112ed87e7eSStefano Zampini       delete wPerm;
40122ed87e7eSStefano Zampini       delete Acoo;
40132ed87e7eSStefano Zampini       delete Bcoo;
40142ed87e7eSStefano Zampini       delete Ccoo;
4015ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4016ed502f03SStefano Zampini       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4017ed502f03SStefano Zampini                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4018ed502f03SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4019ed502f03SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4020ed502f03SStefano Zampini #endif
4021ed502f03SStefano Zampini       if (Acusp->transgen && Bcusp->transgen) { /* if A and B have the transpose, generate C transpose too */
4022ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4023ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4024ed502f03SStefano Zampini         CsrMatrix *CcsrT = new CsrMatrix;
4025ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4026ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4027ed502f03SStefano Zampini 
4028ed502f03SStefano Zampini         Ccusp->transgen = PETSC_TRUE;
4029*a49f1ed0SStefano Zampini         Ccusp->transupdated = PETSC_TRUE;
4030*a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu = NULL;
4031ed502f03SStefano Zampini         CmatT->cprowIndices = NULL;
4032ed502f03SStefano Zampini         CmatT->mat = CcsrT;
4033ed502f03SStefano Zampini         CcsrT->num_rows = n;
4034ed502f03SStefano Zampini         CcsrT->num_cols = m;
4035ed502f03SStefano Zampini         CcsrT->num_entries = c->nz;
4036ed502f03SStefano Zampini 
4037ed502f03SStefano Zampini         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4038ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4039ed502f03SStefano Zampini         CcsrT->values = new THRUSTARRAY(c->nz);
4040ed502f03SStefano Zampini 
4041ed502f03SStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4042ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4043ed502f03SStefano Zampini         if (AT) {
4044ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4045ed502f03SStefano Zampini           thrust::advance(rT,-1);
4046ed502f03SStefano Zampini         }
4047ed502f03SStefano Zampini         if (BT) {
4048ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4049ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4050ed502f03SStefano Zampini           thrust::copy(titb,tite,rT);
4051ed502f03SStefano Zampini         }
4052ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4053ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4054ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4055ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4056ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4057ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4058ed502f03SStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
4059ed502f03SStefano Zampini         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4060ed502f03SStefano Zampini 
4061ed502f03SStefano Zampini         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4062ed502f03SStefano Zampini         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4063ed502f03SStefano Zampini         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4064ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4065ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4066ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4067ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4068ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4069ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4070ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4071ed502f03SStefano Zampini         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4072ed502f03SStefano Zampini                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4073ed502f03SStefano Zampini                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4074ed502f03SStefano Zampini                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4075ed502f03SStefano Zampini #endif
4076ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4077ed502f03SStefano Zampini       }
4078ed502f03SStefano Zampini     }
4079ed502f03SStefano Zampini 
4080ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4081ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4082ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
4083ed502f03SStefano Zampini     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4084ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4085ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4086ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4087ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4088ed502f03SStefano Zampini       ii   = *Ccsr->row_offsets;
4089ed502f03SStefano Zampini       jj   = *Ccsr->column_indices;
4090ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4091ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4092ed502f03SStefano Zampini     } else {
4093ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4094ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4095ed502f03SStefano Zampini     }
4096ed502f03SStefano Zampini     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4097ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4098ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4099ed502f03SStefano Zampini     c->maxnz = c->nz;
4100ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4101ed502f03SStefano Zampini     c->rmax = 0;
4102ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4103ed502f03SStefano Zampini       const PetscInt nn = c->i[i+1] - c->i[i];
4104ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4105ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
4106ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax,nn);
4107ed502f03SStefano Zampini     }
4108ed502f03SStefano Zampini     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4109ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4110ed502f03SStefano Zampini     (*C)->nonzerostate++;
4111ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4112ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4113ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4114ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4115ed502f03SStefano Zampini   } else {
4116ed502f03SStefano Zampini     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
4117ed502f03SStefano Zampini     c = (Mat_SeqAIJ*)(*C)->data;
4118ed502f03SStefano Zampini     if (c->nz) {
4119ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4120ed502f03SStefano Zampini       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4121ed502f03SStefano Zampini       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4122ed502f03SStefano Zampini       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4123ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4124ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4125ed502f03SStefano Zampini       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4126ed502f03SStefano Zampini       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4127ed502f03SStefano Zampini       Acsr = (CsrMatrix*)Acusp->mat->mat;
4128ed502f03SStefano Zampini       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4129ed502f03SStefano Zampini       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4130ed502f03SStefano Zampini       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4131ed502f03SStefano Zampini       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4132ed502f03SStefano Zampini       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4133ed502f03SStefano Zampini       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4134ed502f03SStefano Zampini       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4135ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
4136ed502f03SStefano Zampini       thrust::advance(pmid,Acsr->num_entries);
4137ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4138ed502f03SStefano Zampini       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4139ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4140ed502f03SStefano Zampini       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4141ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4142ed502f03SStefano Zampini       thrust::for_each(zibait,zieait,VecCUDAEquals());
4143ed502f03SStefano Zampini       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4144ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4145ed502f03SStefano Zampini       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4146ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4147ed502f03SStefano Zampini       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4148*a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
4149ed502f03SStefano Zampini       if (Acusp->transgen && Bcusp->transgen && Ccusp->transgen) {
4150ed502f03SStefano Zampini         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4151ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4152ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4153ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4154ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4155ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4156ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4157ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4158*a49f1ed0SStefano Zampini         Ccusp->transupdated = PETSC_TRUE;
4159ed502f03SStefano Zampini       }
4160ed502f03SStefano Zampini       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4161ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4162ed502f03SStefano Zampini     }
4163ed502f03SStefano Zampini   }
4164ed502f03SStefano Zampini   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4165ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4166ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4167ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4168ed502f03SStefano Zampini   PetscFunctionReturn(0);
4169ed502f03SStefano Zampini }
4170c215019aSStefano Zampini 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies n entries of the device-side value array of A into v.

  Input Parameters:
+ A   - the matrix; its value array is accessed read-only through MatSeqAIJCUSPARSEGetArrayRead()
. n   - number of entries to copy
- idx - positions inside the value array to gather (read on the host and uploaded to the device),
        or NULL to copy the first n values contiguously

  Output Parameter:
. v   - destination for the n values; isCudaMem() decides whether it is treated as device or host memory

  Notes:
  With idx given, the gather always runs on the device. If v is host memory the gathered values are first
  written to a temporary device buffer and then copied back with a single cudaMemcpy().

  The temporary buffer is a block-scoped THRUSTARRAY, so it is released on every exit path, including the
  early return taken by CHKERRCUDA() on failure.

  Values that end up in host memory are logged with PetscLogGpuToCpu(), since they travel device -> host.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the gather indices to the device */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    /* staging buffer: stays empty (no allocation requested) when v already lives on the device */
    THRUSTARRAY                     w;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w.resize(n);
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[i] = av[idx[i]] for i in [0,n) */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (!dmem) {
      /* bring the gathered values back to the caller's host buffer */
      cerr = cudaMemcpy(v,w.data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
  } else {
    /* no index set (or nothing to gather): plain contiguous copy of the leading n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* the values moved device -> host, so account for them as a GPU-to-CPU transfer */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4210