xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 1a2c6b5c4ccd6e90882584b05035e103099f6217)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
  matrix storage format using the CUSPARSE library.
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
89ae82921SPaul Mullowney 
93d13b8fdSMatthew G. Knepley #include <petscconf.h>
103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
13af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
149ae82921SPaul Mullowney #undef VecType
153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h>
17bc3f50f2SPaul Mullowney 
18e057df02SPaul Mullowney const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
19afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
20afb2bd1cSJunchao Zhang   /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
21afb2bd1cSJunchao Zhang     0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.
22afb2bd1cSJunchao Zhang 
23afb2bd1cSJunchao Zhang   typedef enum {
24afb2bd1cSJunchao Zhang       CUSPARSE_MV_ALG_DEFAULT = 0,
25afb2bd1cSJunchao Zhang       CUSPARSE_COOMV_ALG      = 1,
26afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG1     = 2,
27afb2bd1cSJunchao Zhang       CUSPARSE_CSRMV_ALG2     = 3
28afb2bd1cSJunchao Zhang   } cusparseSpMVAlg_t;
29afb2bd1cSJunchao Zhang 
30afb2bd1cSJunchao Zhang   typedef enum {
31afb2bd1cSJunchao Zhang       CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
32afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
33afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
34afb2bd1cSJunchao Zhang       CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
35afb2bd1cSJunchao Zhang       CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
36afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_ALG_DEFAULT = 0,
37afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG1    = 1,
38afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG2    = 2,
39afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG3    = 3,
40afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_COO_ALG4    = 5,
41afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG1    = 4,
42afb2bd1cSJunchao Zhang       CUSPARSE_SPMM_CSR_ALG2    = 6,
43afb2bd1cSJunchao Zhang   } cusparseSpMMAlg_t;
44afb2bd1cSJunchao Zhang 
45afb2bd1cSJunchao Zhang   typedef enum {
46afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
47afb2bd1cSJunchao Zhang       CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
48afb2bd1cSJunchao Zhang   } cusparseCsr2CscAlg_t;
49afb2bd1cSJunchao Zhang   */
50afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
51afb2bd1cSJunchao Zhang   const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
52afb2bd1cSJunchao Zhang   const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
53afb2bd1cSJunchao Zhang #endif
549ae82921SPaul Mullowney 
55087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
56087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
57087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
58087f3262SPaul Mullowney 
596fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
606fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
616fa9248bSJed Brown static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);
62087f3262SPaul Mullowney 
636fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
646fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
656fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
666fa9248bSJed Brown static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
674416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
68a587d139SMark static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
6933c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
706fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
716fa9248bSJed Brown static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
726fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
736fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
74e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
75e6e9a74fSStefano Zampini static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
76e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);
779ae82921SPaul Mullowney 
787f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
79470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
80470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
81ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
82470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
83470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);
847f756511SDominic Meiser 
8557181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
8657181aedSStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
87a49f1ed0SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);
8857181aedSStefano Zampini 
897e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
907e8381f9SStefano Zampini PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);
917e8381f9SStefano Zampini 
92c215019aSStefano Zampini static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);
93c215019aSStefano Zampini 
94b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
95b06137fdSPaul Mullowney {
96b06137fdSPaul Mullowney   cusparseStatus_t   stat;
97b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
98b06137fdSPaul Mullowney 
99b06137fdSPaul Mullowney   PetscFunctionBegin;
100d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
101b06137fdSPaul Mullowney   cusparsestruct->stream = stream;
10257d48284SJunchao Zhang   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
103b06137fdSPaul Mullowney   PetscFunctionReturn(0);
104b06137fdSPaul Mullowney }
105b06137fdSPaul Mullowney 
106b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
107b06137fdSPaul Mullowney {
108b06137fdSPaul Mullowney   cusparseStatus_t   stat;
109b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
110b06137fdSPaul Mullowney 
111b06137fdSPaul Mullowney   PetscFunctionBegin;
112d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
1136b1cf21dSAlejandro Lamas Daviña   if (cusparsestruct->handle != handle) {
11416a2e217SAlejandro Lamas Daviña     if (cusparsestruct->handle) {
11557d48284SJunchao Zhang       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
11616a2e217SAlejandro Lamas Daviña     }
117b06137fdSPaul Mullowney     cusparsestruct->handle = handle;
1186b1cf21dSAlejandro Lamas Daviña   }
11957d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
120b06137fdSPaul Mullowney   PetscFunctionReturn(0);
121b06137fdSPaul Mullowney }
122b06137fdSPaul Mullowney 
123b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A)
124b06137fdSPaul Mullowney {
125b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1267e8381f9SStefano Zampini   PetscBool          flg;
1277e8381f9SStefano Zampini   PetscErrorCode     ierr;
128ccdfe979SStefano Zampini 
129b06137fdSPaul Mullowney   PetscFunctionBegin;
1307e8381f9SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1317e8381f9SStefano Zampini   if (!flg || !cusparsestruct) PetscFunctionReturn(0);
132ccdfe979SStefano Zampini   if (cusparsestruct->handle) cusparsestruct->handle = 0;
133b06137fdSPaul Mullowney   PetscFunctionReturn(0);
134b06137fdSPaul Mullowney }
135b06137fdSPaul Mullowney 
136ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
1379ae82921SPaul Mullowney {
1389ae82921SPaul Mullowney   PetscFunctionBegin;
1399ae82921SPaul Mullowney   *type = MATSOLVERCUSPARSE;
1409ae82921SPaul Mullowney   PetscFunctionReturn(0);
1419ae82921SPaul Mullowney }
1429ae82921SPaul Mullowney 
143c708e6cdSJed Brown /*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp on a single GPU. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor, and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
150c708e6cdSJed Brown 
1519ae82921SPaul Mullowney   Level: beginner
152c708e6cdSJed Brown 
1533ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
154c708e6cdSJed Brown M*/
1559ae82921SPaul Mullowney 
15642c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
1579ae82921SPaul Mullowney {
1589ae82921SPaul Mullowney   PetscErrorCode ierr;
159bc3f50f2SPaul Mullowney   PetscInt       n = A->rmap->n;
1609ae82921SPaul Mullowney 
1619ae82921SPaul Mullowney   PetscFunctionBegin;
162bc3f50f2SPaul Mullowney   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
163bc3f50f2SPaul Mullowney   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1642c7c0729SBarry Smith   (*B)->factortype = ftype;
1652c7c0729SBarry Smith   (*B)->useordering = PETSC_TRUE;
1669ae82921SPaul Mullowney   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1672205254eSKarl Rupp 
168087f3262SPaul Mullowney   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
16933d57670SJed Brown     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1709ae82921SPaul Mullowney     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1719ae82921SPaul Mullowney     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
172087f3262SPaul Mullowney   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
173087f3262SPaul Mullowney     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
174087f3262SPaul Mullowney     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
1759ae82921SPaul Mullowney   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
176bc3f50f2SPaul Mullowney 
177fa03d054SJed Brown   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
1783ca39a21SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
1799ae82921SPaul Mullowney   PetscFunctionReturn(0);
1809ae82921SPaul Mullowney }
1819ae82921SPaul Mullowney 
182bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
183ca45077fSPaul Mullowney {
184aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1856e111a19SKarl Rupp 
186ca45077fSPaul Mullowney   PetscFunctionBegin;
187ca45077fSPaul Mullowney   switch (op) {
188e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT:
189aa372e3fSPaul Mullowney     cusparsestruct->format = format;
190ca45077fSPaul Mullowney     break;
191e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
192aa372e3fSPaul Mullowney     cusparsestruct->format = format;
193ca45077fSPaul Mullowney     break;
194ca45077fSPaul Mullowney   default:
19536d62e41SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
196ca45077fSPaul Mullowney   }
197ca45077fSPaul Mullowney   PetscFunctionReturn(0);
198ca45077fSPaul Mullowney }
1999ae82921SPaul Mullowney 
200e057df02SPaul Mullowney /*@
201e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
202e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
203aa372e3fSPaul Mullowney    for MPIAIJCUSPARSE matrices.
204e057df02SPaul Mullowney    Not Collective
205e057df02SPaul Mullowney 
206e057df02SPaul Mullowney    Input Parameters:
2078468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
20836d62e41SPaul Mullowney .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
2092692e278SPaul Mullowney -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
210e057df02SPaul Mullowney 
211e057df02SPaul Mullowney    Output Parameter:
212e057df02SPaul Mullowney 
213e057df02SPaul Mullowney    Level: intermediate
214e057df02SPaul Mullowney 
2158468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
216e057df02SPaul Mullowney @*/
217e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
218e057df02SPaul Mullowney {
219e057df02SPaul Mullowney   PetscErrorCode ierr;
2206e111a19SKarl Rupp 
221e057df02SPaul Mullowney   PetscFunctionBegin;
222e057df02SPaul Mullowney   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
223e057df02SPaul Mullowney   ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
224e057df02SPaul Mullowney   PetscFunctionReturn(0);
225e057df02SPaul Mullowney }
226e057df02SPaul Mullowney 
227*1a2c6b5cSJunchao Zhang PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
228e6e9a74fSStefano Zampini {
229e6e9a74fSStefano Zampini   PetscErrorCode ierr;
230e6e9a74fSStefano Zampini 
231e6e9a74fSStefano Zampini   PetscFunctionBegin;
232*1a2c6b5cSJunchao Zhang   switch (op) {
233*1a2c6b5cSJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
234*1a2c6b5cSJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
235*1a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
236*1a2c6b5cSJunchao Zhang       A->form_explicit_transpose = flg;
237*1a2c6b5cSJunchao Zhang       break;
238*1a2c6b5cSJunchao Zhang     default:
239*1a2c6b5cSJunchao Zhang       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
240*1a2c6b5cSJunchao Zhang       break;
241e6e9a74fSStefano Zampini   }
242e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
243e6e9a74fSStefano Zampini }
244e6e9a74fSStefano Zampini 
2454416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
2469ae82921SPaul Mullowney {
2479ae82921SPaul Mullowney   PetscErrorCode           ierr;
248e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
2499ae82921SPaul Mullowney   PetscBool                flg;
250a183c035SDominic Meiser   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2516e111a19SKarl Rupp 
2529ae82921SPaul Mullowney   PetscFunctionBegin;
253e55864a3SBarry Smith   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
2549ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
255e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
256a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
257afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
258afb2bd1cSJunchao Zhang 
2594c87dfd4SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
260a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
261afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
262afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
263afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
264afb2bd1cSJunchao Zhang                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
265afb2bd1cSJunchao Zhang     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
266afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
267afb2bd1cSJunchao Zhang 
268afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
269afb2bd1cSJunchao Zhang                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
270afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
271afb2bd1cSJunchao Zhang 
272afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
273afb2bd1cSJunchao Zhang                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
274afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
275afb2bd1cSJunchao Zhang    #endif
2764c87dfd4SPaul Mullowney   }
2770af67c1bSStefano Zampini   ierr = PetscOptionsTail();CHKERRQ(ierr);
2789ae82921SPaul Mullowney   PetscFunctionReturn(0);
2799ae82921SPaul Mullowney }
2809ae82921SPaul Mullowney 
2816fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
2829ae82921SPaul Mullowney {
283da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
2849ae82921SPaul Mullowney   PetscErrorCode               ierr;
2859ae82921SPaul Mullowney 
2869ae82921SPaul Mullowney   PetscFunctionBegin;
287da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
2889ae82921SPaul Mullowney   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
2899ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
2909ae82921SPaul Mullowney   PetscFunctionReturn(0);
2919ae82921SPaul Mullowney }
2929ae82921SPaul Mullowney 
2936fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
2949ae82921SPaul Mullowney {
295da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
2969ae82921SPaul Mullowney   PetscErrorCode               ierr;
2979ae82921SPaul Mullowney 
2989ae82921SPaul Mullowney   PetscFunctionBegin;
299da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
3009ae82921SPaul Mullowney   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3019ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3029ae82921SPaul Mullowney   PetscFunctionReturn(0);
3039ae82921SPaul Mullowney }
3049ae82921SPaul Mullowney 
305087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
306087f3262SPaul Mullowney {
307da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
308087f3262SPaul Mullowney   PetscErrorCode               ierr;
309087f3262SPaul Mullowney 
310087f3262SPaul Mullowney   PetscFunctionBegin;
311da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
312087f3262SPaul Mullowney   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
313087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
314087f3262SPaul Mullowney   PetscFunctionReturn(0);
315087f3262SPaul Mullowney }
316087f3262SPaul Mullowney 
317087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
318087f3262SPaul Mullowney {
319da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
320087f3262SPaul Mullowney   PetscErrorCode               ierr;
321087f3262SPaul Mullowney 
322087f3262SPaul Mullowney   PetscFunctionBegin;
323da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
324087f3262SPaul Mullowney   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
325087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
326087f3262SPaul Mullowney   PetscFunctionReturn(0);
327087f3262SPaul Mullowney }
328087f3262SPaul Mullowney 
329087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
3309ae82921SPaul Mullowney {
3319ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3329ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3339ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
334aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
3359ae82921SPaul Mullowney   cusparseStatus_t                  stat;
3369ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
3379ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3389ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3399ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
340b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
34157d48284SJunchao Zhang   cudaError_t                       cerr;
3429ae82921SPaul Mullowney 
3439ae82921SPaul Mullowney   PetscFunctionBegin;
344cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
345c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3469ae82921SPaul Mullowney     try {
3479ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3489ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
349da79fbbcSStefano Zampini       if (!loTriFactor) {
3502cbc15d9SMark         PetscScalar                       *AALo;
3512cbc15d9SMark 
3522cbc15d9SMark         cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
3539ae82921SPaul Mullowney 
3549ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
35557d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
35657d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
3579ae82921SPaul Mullowney 
3589ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
3599ae82921SPaul Mullowney         AiLo[0]  = (PetscInt) 0;
3609ae82921SPaul Mullowney         AiLo[n]  = nzLower;
3619ae82921SPaul Mullowney         AjLo[0]  = (PetscInt) 0;
3629ae82921SPaul Mullowney         AALo[0]  = (MatScalar) 1.0;
3639ae82921SPaul Mullowney         v        = aa;
3649ae82921SPaul Mullowney         vi       = aj;
3659ae82921SPaul Mullowney         offset   = 1;
3669ae82921SPaul Mullowney         rowOffset= 1;
3679ae82921SPaul Mullowney         for (i=1; i<n; i++) {
3689ae82921SPaul Mullowney           nz = ai[i+1] - ai[i];
369e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
3709ae82921SPaul Mullowney           AiLo[i]    = rowOffset;
3719ae82921SPaul Mullowney           rowOffset += nz+1;
3729ae82921SPaul Mullowney 
373580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
374580bdb30SBarry Smith           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
3759ae82921SPaul Mullowney 
3769ae82921SPaul Mullowney           offset      += nz;
3779ae82921SPaul Mullowney           AjLo[offset] = (PetscInt) i;
3789ae82921SPaul Mullowney           AALo[offset] = (MatScalar) 1.0;
3799ae82921SPaul Mullowney           offset      += 1;
3809ae82921SPaul Mullowney 
3819ae82921SPaul Mullowney           v  += nz;
3829ae82921SPaul Mullowney           vi += nz;
3839ae82921SPaul Mullowney         }
3842205254eSKarl Rupp 
385aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
386da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
387da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
388aa372e3fSPaul Mullowney         /* Create the matrix description */
38957d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
39057d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3911b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
392afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
393afb2bd1cSJunchao Zhang        #else
39457d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
395afb2bd1cSJunchao Zhang        #endif
39657d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
39757d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
398aa372e3fSPaul Mullowney 
399aa372e3fSPaul Mullowney         /* set the operation */
400aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
401aa372e3fSPaul Mullowney 
402aa372e3fSPaul Mullowney         /* set the matrix */
403aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
404aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = n;
405aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = n;
406aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
407aa372e3fSPaul Mullowney 
408aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
409aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
410aa372e3fSPaul Mullowney 
411aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
412aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
413aa372e3fSPaul Mullowney 
414aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
415aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
416aa372e3fSPaul Mullowney 
417afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
418da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
419afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
4201b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
421afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
422afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
423afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
424afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
425afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
426afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
427afb2bd1cSJunchao Zhang       #endif
428afb2bd1cSJunchao Zhang 
429aa372e3fSPaul Mullowney         /* perform the solve analysis */
430aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
431aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
432aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
433afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
4341b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
435afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
436afb2bd1cSJunchao Zhang                                #endif
437afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
438da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
439da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
440aa372e3fSPaul Mullowney 
441da79fbbcSStefano Zampini         /* assign the pointer */
442aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
4432cbc15d9SMark         loTriFactor->AA_h = AALo;
44457d48284SJunchao Zhang         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
44557d48284SJunchao Zhang         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
4464863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
447da79fbbcSStefano Zampini       } else { /* update values only */
4482cbc15d9SMark         if (!loTriFactor->AA_h) {
4492cbc15d9SMark           cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
4502cbc15d9SMark         }
451da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
4522cbc15d9SMark         loTriFactor->AA_h[0]  = 1.0;
453da79fbbcSStefano Zampini         v        = aa;
454da79fbbcSStefano Zampini         vi       = aj;
455da79fbbcSStefano Zampini         offset   = 1;
456da79fbbcSStefano Zampini         for (i=1; i<n; i++) {
457da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i];
4582cbc15d9SMark           ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
459da79fbbcSStefano Zampini           offset      += nz;
4602cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
461da79fbbcSStefano Zampini           offset      += 1;
462da79fbbcSStefano Zampini           v  += nz;
463da79fbbcSStefano Zampini         }
4642cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
465da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
466da79fbbcSStefano Zampini       }
4679ae82921SPaul Mullowney     } catch(char *ex) {
4689ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
4699ae82921SPaul Mullowney     }
4709ae82921SPaul Mullowney   }
4719ae82921SPaul Mullowney   PetscFunctionReturn(0);
4729ae82921SPaul Mullowney }
4739ae82921SPaul Mullowney 
474087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
4759ae82921SPaul Mullowney {
4769ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
4779ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
4789ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
479aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
4809ae82921SPaul Mullowney   cusparseStatus_t                  stat;
4819ae82921SPaul Mullowney   const PetscInt                    *aj = a->j,*adiag = a->diag,*vi;
4829ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
4839ae82921SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
4849ae82921SPaul Mullowney   PetscInt                          i,nz, nzUpper, offset;
4859ae82921SPaul Mullowney   PetscErrorCode                    ierr;
48657d48284SJunchao Zhang   cudaError_t                       cerr;
4879ae82921SPaul Mullowney 
4889ae82921SPaul Mullowney   PetscFunctionBegin;
489cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
490c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
4919ae82921SPaul Mullowney     try {
4929ae82921SPaul Mullowney       /* next, figure out the number of nonzeros in the upper triangular matrix. */
4939ae82921SPaul Mullowney       nzUpper = adiag[0]-adiag[n];
494da79fbbcSStefano Zampini       if (!upTriFactor) {
4952cbc15d9SMark         PetscScalar *AAUp;
4962cbc15d9SMark 
4972cbc15d9SMark         cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
4982cbc15d9SMark 
4999ae82921SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
50057d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
50157d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
5029ae82921SPaul Mullowney 
5039ae82921SPaul Mullowney         /* Fill the upper triangular matrix */
5049ae82921SPaul Mullowney         AiUp[0]=(PetscInt) 0;
5059ae82921SPaul Mullowney         AiUp[n]=nzUpper;
5069ae82921SPaul Mullowney         offset = nzUpper;
5079ae82921SPaul Mullowney         for (i=n-1; i>=0; i--) {
5089ae82921SPaul Mullowney           v  = aa + adiag[i+1] + 1;
5099ae82921SPaul Mullowney           vi = aj + adiag[i+1] + 1;
5109ae82921SPaul Mullowney 
511e057df02SPaul Mullowney           /* number of elements NOT on the diagonal */
5129ae82921SPaul Mullowney           nz = adiag[i] - adiag[i+1]-1;
5139ae82921SPaul Mullowney 
514e057df02SPaul Mullowney           /* decrement the offset */
5159ae82921SPaul Mullowney           offset -= (nz+1);
5169ae82921SPaul Mullowney 
517e057df02SPaul Mullowney           /* first, set the diagonal elements */
5189ae82921SPaul Mullowney           AjUp[offset] = (PetscInt) i;
51909f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1./v[nz];
5209ae82921SPaul Mullowney           AiUp[i]      = AiUp[i+1] - (nz+1);
5219ae82921SPaul Mullowney 
522580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjUp[offset+1]), vi, nz);CHKERRQ(ierr);
523580bdb30SBarry Smith           ierr = PetscArraycpy(&(AAUp[offset+1]), v, nz);CHKERRQ(ierr);
5249ae82921SPaul Mullowney         }
5252205254eSKarl Rupp 
526aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
527da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
528da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
5292205254eSKarl Rupp 
530aa372e3fSPaul Mullowney         /* Create the matrix description */
53157d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
53257d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
5331b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
534afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
535afb2bd1cSJunchao Zhang        #else
53657d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
537afb2bd1cSJunchao Zhang        #endif
53857d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
53957d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
540aa372e3fSPaul Mullowney 
541aa372e3fSPaul Mullowney         /* set the operation */
542aa372e3fSPaul Mullowney         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
543aa372e3fSPaul Mullowney 
544aa372e3fSPaul Mullowney         /* set the matrix */
545aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
546aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = n;
547aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = n;
548aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = nzUpper;
549aa372e3fSPaul Mullowney 
550aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
551aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+n+1);
552aa372e3fSPaul Mullowney 
553aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper);
554aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+nzUpper);
555aa372e3fSPaul Mullowney 
556aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper);
557aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+nzUpper);
558aa372e3fSPaul Mullowney 
559afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
560da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
561afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
5621b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
563afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
564afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
565afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
566afb2bd1cSJunchao Zhang                                      upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
567afb2bd1cSJunchao Zhang                                      &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
568afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
569afb2bd1cSJunchao Zhang       #endif
570afb2bd1cSJunchao Zhang 
571aa372e3fSPaul Mullowney         /* perform the solve analysis */
572aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
573aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
574aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
575afb2bd1cSJunchao Zhang                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
5761b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
577afb2bd1cSJunchao Zhang                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
578afb2bd1cSJunchao Zhang                                #endif
579afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
580da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
581da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
582aa372e3fSPaul Mullowney 
583da79fbbcSStefano Zampini         /* assign the pointer */
584aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
5852cbc15d9SMark         upTriFactor->AA_h = AAUp;
58657d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
58757d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
5884863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzUpper)*sizeof(int)+nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
589da79fbbcSStefano Zampini       } else {
5902cbc15d9SMark         if (!upTriFactor->AA_h) {
5912cbc15d9SMark           cerr = cudaMallocHost((void**) &upTriFactor->AA_h, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
5922cbc15d9SMark         }
593da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
594da79fbbcSStefano Zampini         offset = nzUpper;
595da79fbbcSStefano Zampini         for (i=n-1; i>=0; i--) {
596da79fbbcSStefano Zampini           v  = aa + adiag[i+1] + 1;
597da79fbbcSStefano Zampini 
598da79fbbcSStefano Zampini           /* number of elements NOT on the diagonal */
599da79fbbcSStefano Zampini           nz = adiag[i] - adiag[i+1]-1;
600da79fbbcSStefano Zampini 
601da79fbbcSStefano Zampini           /* decrement the offset */
602da79fbbcSStefano Zampini           offset -= (nz+1);
603da79fbbcSStefano Zampini 
604da79fbbcSStefano Zampini           /* first, set the diagonal elements */
6052cbc15d9SMark           upTriFactor->AA_h[offset] = 1./v[nz];
6062cbc15d9SMark           ierr = PetscArraycpy(&(upTriFactor->AA_h[offset+1]), v, nz);CHKERRQ(ierr);
607da79fbbcSStefano Zampini         }
6082cbc15d9SMark         upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h+nzUpper);
609da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzUpper*sizeof(PetscScalar));CHKERRQ(ierr);
610da79fbbcSStefano Zampini       }
6119ae82921SPaul Mullowney     } catch(char *ex) {
6129ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
6139ae82921SPaul Mullowney     }
6149ae82921SPaul Mullowney   }
6159ae82921SPaul Mullowney   PetscFunctionReturn(0);
6169ae82921SPaul Mullowney }
6179ae82921SPaul Mullowney 
618087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
6199ae82921SPaul Mullowney {
6209ae82921SPaul Mullowney   PetscErrorCode               ierr;
6219ae82921SPaul Mullowney   Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
6229ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
6239ae82921SPaul Mullowney   IS                           isrow = a->row,iscol = a->icol;
6249ae82921SPaul Mullowney   PetscBool                    row_identity,col_identity;
6259ae82921SPaul Mullowney   PetscInt                     n = A->rmap->n;
6269ae82921SPaul Mullowney 
6279ae82921SPaul Mullowney   PetscFunctionBegin;
628da79fbbcSStefano Zampini   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
629087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
630087f3262SPaul Mullowney   ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);
6312205254eSKarl Rupp 
632da79fbbcSStefano Zampini   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
633aa372e3fSPaul Mullowney   cusparseTriFactors->nnz=a->nz;
6349ae82921SPaul Mullowney 
635c70f7ee4SJunchao Zhang   A->offloadmask = PETSC_OFFLOAD_BOTH;
636e057df02SPaul Mullowney   /* lower triangular indices */
6379ae82921SPaul Mullowney   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
638da79fbbcSStefano Zampini   if (!row_identity && !cusparseTriFactors->rpermIndices) {
639da79fbbcSStefano Zampini     const PetscInt *r;
640da79fbbcSStefano Zampini 
641da79fbbcSStefano Zampini     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
642aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
643aa372e3fSPaul Mullowney     cusparseTriFactors->rpermIndices->assign(r, r+n);
6449ae82921SPaul Mullowney     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
645da79fbbcSStefano Zampini     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
646da79fbbcSStefano Zampini   }
6479ae82921SPaul Mullowney 
648e057df02SPaul Mullowney   /* upper triangular indices */
6499ae82921SPaul Mullowney   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
650da79fbbcSStefano Zampini   if (!col_identity && !cusparseTriFactors->cpermIndices) {
651da79fbbcSStefano Zampini     const PetscInt *c;
652da79fbbcSStefano Zampini 
653da79fbbcSStefano Zampini     ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);
654aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
655aa372e3fSPaul Mullowney     cusparseTriFactors->cpermIndices->assign(c, c+n);
6569ae82921SPaul Mullowney     ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
657da79fbbcSStefano Zampini     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
658da79fbbcSStefano Zampini   }
6599ae82921SPaul Mullowney   PetscFunctionReturn(0);
6609ae82921SPaul Mullowney }
6619ae82921SPaul Mullowney 
662087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
663087f3262SPaul Mullowney {
664087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
665087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
666aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
667aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
668087f3262SPaul Mullowney   cusparseStatus_t                  stat;
669087f3262SPaul Mullowney   PetscErrorCode                    ierr;
67057d48284SJunchao Zhang   cudaError_t                       cerr;
671087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
672087f3262SPaul Mullowney   PetscScalar                       *AAUp;
673087f3262SPaul Mullowney   PetscScalar                       *AALo;
674087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
675087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
676087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
677087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
678087f3262SPaul Mullowney 
679087f3262SPaul Mullowney   PetscFunctionBegin;
680cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
681c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
682087f3262SPaul Mullowney     try {
683da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
684da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
685da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
686087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
68757d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
68857d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
689087f3262SPaul Mullowney 
690087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
691087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
692087f3262SPaul Mullowney         AiUp[n]=nzUpper;
693087f3262SPaul Mullowney         offset = 0;
694087f3262SPaul Mullowney         for (i=0; i<n; i++) {
695087f3262SPaul Mullowney           /* set the pointers */
696087f3262SPaul Mullowney           v  = aa + ai[i];
697087f3262SPaul Mullowney           vj = aj + ai[i];
698087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
699087f3262SPaul Mullowney 
700087f3262SPaul Mullowney           /* first, set the diagonal elements */
701087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
70209f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
703087f3262SPaul Mullowney           AiUp[i]      = offset;
70409f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
705087f3262SPaul Mullowney 
706087f3262SPaul Mullowney           offset+=1;
707087f3262SPaul Mullowney           if (nz>0) {
708f22e0265SBarry Smith             ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
709580bdb30SBarry Smith             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
710087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
711087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
712087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
713087f3262SPaul Mullowney             }
714087f3262SPaul Mullowney             offset+=nz;
715087f3262SPaul Mullowney           }
716087f3262SPaul Mullowney         }
717087f3262SPaul Mullowney 
718aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
719da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
720da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
721087f3262SPaul Mullowney 
722aa372e3fSPaul Mullowney         /* Create the matrix description */
72357d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
72457d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
7251b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
726afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
727afb2bd1cSJunchao Zhang        #else
72857d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
729afb2bd1cSJunchao Zhang        #endif
73057d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
73157d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
732087f3262SPaul Mullowney 
733aa372e3fSPaul Mullowney         /* set the matrix */
734aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
735aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
736aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
737aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
738aa372e3fSPaul Mullowney 
739aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
740aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
741aa372e3fSPaul Mullowney 
742aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
743aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
744aa372e3fSPaul Mullowney 
745aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
746aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
747aa372e3fSPaul Mullowney 
748afb2bd1cSJunchao Zhang         /* set the operation */
749afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
750afb2bd1cSJunchao Zhang 
751afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
752da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
753afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
7541b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
755afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
756afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
757afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
758afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
759afb2bd1cSJunchao Zhang                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
760afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
761afb2bd1cSJunchao Zhang       #endif
762afb2bd1cSJunchao Zhang 
763aa372e3fSPaul Mullowney         /* perform the solve analysis */
764aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
765aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
766aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
767afb2bd1cSJunchao Zhang                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
7681b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
769afb2bd1cSJunchao Zhang                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
770afb2bd1cSJunchao Zhang                                 #endif
771afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
772da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
773da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
774aa372e3fSPaul Mullowney 
775da79fbbcSStefano Zampini         /* assign the pointer */
776aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
777aa372e3fSPaul Mullowney 
778aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
779da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
780da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
781aa372e3fSPaul Mullowney 
782aa372e3fSPaul Mullowney         /* Create the matrix description */
78357d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
78457d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
7851b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
786afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
787afb2bd1cSJunchao Zhang        #else
78857d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
789afb2bd1cSJunchao Zhang        #endif
79057d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
79157d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
792aa372e3fSPaul Mullowney 
793aa372e3fSPaul Mullowney         /* set the operation */
794aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
795aa372e3fSPaul Mullowney 
796aa372e3fSPaul Mullowney         /* set the matrix */
797aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
798aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
799aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
800aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
801aa372e3fSPaul Mullowney 
802aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
803aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
804aa372e3fSPaul Mullowney 
805aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
806aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
807aa372e3fSPaul Mullowney 
808aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
809aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
810aa372e3fSPaul Mullowney 
811afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
812da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
813afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
8141b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
815afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
816afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
817afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
818afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
819afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
820afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
821afb2bd1cSJunchao Zhang       #endif
822afb2bd1cSJunchao Zhang 
823aa372e3fSPaul Mullowney         /* perform the solve analysis */
824aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
825aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
826aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
827afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
8281b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
829afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
830afb2bd1cSJunchao Zhang                                 #endif
831afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
832da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
833da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
834aa372e3fSPaul Mullowney 
835da79fbbcSStefano Zampini         /* assign the pointer */
836aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
837087f3262SPaul Mullowney 
838da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
83957d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
84057d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
841da79fbbcSStefano Zampini       } else {
842da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
843da79fbbcSStefano Zampini         offset = 0;
844da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
845da79fbbcSStefano Zampini           /* set the pointers */
846da79fbbcSStefano Zampini           v  = aa + ai[i];
847da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
848da79fbbcSStefano Zampini 
849da79fbbcSStefano Zampini           /* first, set the diagonal elements */
850da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
851da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
852da79fbbcSStefano Zampini 
853da79fbbcSStefano Zampini           offset+=1;
854da79fbbcSStefano Zampini           if (nz>0) {
855da79fbbcSStefano Zampini             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
856da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
857da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
858da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
859da79fbbcSStefano Zampini             }
860da79fbbcSStefano Zampini             offset+=nz;
861da79fbbcSStefano Zampini           }
862da79fbbcSStefano Zampini         }
863da79fbbcSStefano Zampini         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
864da79fbbcSStefano Zampini         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
865da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
866da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
867da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
868da79fbbcSStefano Zampini       }
86957d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
87057d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
871087f3262SPaul Mullowney     } catch(char *ex) {
872087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
873087f3262SPaul Mullowney     }
874087f3262SPaul Mullowney   }
875087f3262SPaul Mullowney   PetscFunctionReturn(0);
876087f3262SPaul Mullowney }
877087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - Builds the ICC triangular factors of A on the GPU and,
  when the factorization ordering is not the identity, uploads the row permutation and its inverse.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  Calls MatSeqAIJCUSPARSEBuildICCTriMatrices(), allocates the work vector on first use, records the
  number of nonzeros of the two triangular factors and marks the matrix as PETSC_OFFLOAD_BOTH.

  The permutation arrays are allocated only once and refilled on later calls. Allocating them
  unconditionally dropped the previously allocated device arrays every time the numeric
  factorization was repeated. This assumes the pointers are NULL until first allocated, the same
  assumption the workVector guard below relies on.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* off-diagonal entries are counted twice (once per factor), the diagonal once */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* lower triangular indices */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    /* reuse the device arrays from a previous call instead of leaking them; assign() resizes as needed */
    if (!cusparseTriFactors->rpermIndices) { cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    if (!cusparseTriFactors->cpermIndices) { cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
915087f3262SPaul Mullowney 
/*
  MatLUFactorNumeric_SeqAIJCUSPARSE - Numeric LU factorization for SEQAIJCUSPARSE matrices.

  Input Parameters:
+ B    - the factor matrix
. A    - the matrix to factor
- info - factorization options, forwarded to MatLUFactorNumeric_SeqAIJ()

  Notes:
  A is first copied back from the GPU, the factorization itself is done by MatLUFactorNumeric_SeqAIJ(),
  then the solve callbacks of B are selected and the triangular factors are analyzed and copied to the GPU.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  IS             rowperm = fact->row;
  IS             colperm = fact->col;
  PetscBool      rowIsIdentity,colIsIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the MatSolve variant: the natural-ordering one needs both permutations to be the identity */
  ierr = ISIdentity(rowperm,&rowIsIdentity);CHKERRQ(ierr);
  ierr = ISIdentity(colperm,&colIsIdentity);CHKERRQ(ierr);
  if (!rowIsIdentity || !colIsIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }
  /* the multi-right-hand-side solves are cleared in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
9469ae82921SPaul Mullowney 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - Numeric Cholesky factorization for SEQAIJCUSPARSE matrices.

  Input Parameters:
+ B    - the factor matrix
. A    - the matrix to factor
- info - factorization options, forwarded to MatCholeskyFactorNumeric_SeqAIJ()

  Notes:
  A is first copied back from the GPU, the factorization itself is done by MatCholeskyFactorNumeric_SeqAIJ(),
  then the solve callbacks of B are selected and the triangular factors are analyzed and copied to the GPU.
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  IS             perm = fact->row;
  PetscBool      permIsIdentity;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* pick the MatSolve variant according to whether the ordering is the identity */
  ierr = ISIdentity(perm,&permIsIdentity);CHKERRQ(ierr);
  if (!permIsIdentity) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }
  /* the multi-right-hand-side solves are cleared in either case */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
9769ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildTriFactorTranspose_Private - Builds the explicit transpose (CSC) of one
  triangular factor on the GPU and runs the triangular solve analysis on it.

  Input Parameters:
+ A                  - the factored matrix, used only as the object for event logging
. cusparseTriFactors - container providing the cuSPARSE handle
- triFactor          - the factor to transpose; with CUDA >= 11 its csr2cscBuffer and csr2cscBufferSize are (re)set here

  Output Parameter:
. triFactorTOut - newly allocated structure holding the transposed factor, its descriptor and its solve analysis data
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildTriFactorTranspose_Private(Mat A,Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors,Mat_SeqAIJCUSPARSETriFactorStruct *triFactor,Mat_SeqAIJCUSPARSETriFactorStruct **triFactorTOut)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *triFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the triangular factor */
  ierr = PetscNew(&triFactorT);CHKERRQ(ierr);
  triFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* copy the descriptor properties of the factor; the fill mode is flipped since the matrix is transposed */
  matrixType = cusparseGetMatType(triFactor->descr);
  indexBase = cusparseGetMatIndexBase(triFactor->descr);
  fillMode = cusparseGetMatFillMode(triFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(triFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&triFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(triFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(triFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(triFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(triFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation: the transpose is stored explicitly, so the solve itself is non-transposed */
  triFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the triangular factor */
  triFactorT->csrMat = new CsrMatrix;
  triFactorT->csrMat->num_rows       = triFactor->csrMat->num_cols;
  triFactorT->csrMat->num_cols       = triFactor->csrMat->num_rows;
  triFactorT->csrMat->num_entries    = triFactor->csrMat->num_entries;
  triFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(triFactorT->csrMat->num_rows+1);
  triFactorT->csrMat->column_indices = new THRUSTINTARRAY32(triFactorT->csrMat->num_entries);
  triFactorT->csrMat->values         = new THRUSTARRAY(triFactorT->csrMat->num_entries);

  /* compute the transpose of the triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                                       triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                                       triFactor->csrMat->values->data().get(),
                                       triFactor->csrMat->row_offsets->data().get(),
                                       triFactor->csrMat->column_indices->data().get(),
                                       triFactorT->csrMat->values->data().get(),
                                       triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &triFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&triFactor->csr2cscBuffer,triFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                          triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                          triFactor->csrMat->values->data().get(),
                          triFactor->csrMat->row_offsets->data().get(),
                          triFactor->csrMat->column_indices->data().get(),
                          triFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, triFactor->csr2cscBuffer
                        #else
                          triFactorT->csrMat->column_indices->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* close the event opened above; this used to be a second PetscLogEventBegin(), leaving the event unbalanced */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&triFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, triFactorT->solveOp,
                                 triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                                 triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                                 triFactorT->csrMat->column_indices->data().get(), triFactorT->solveInfo,
                                 &triFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&triFactorT->solveBuffer,triFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, triFactorT->solveOp,
                           triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                           triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                           triFactorT->csrMat->column_indices->data().get(), triFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,triFactorT->solvePolicy, triFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  *triFactorTOut = triFactorT;
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - Builds the explicit transposes of the lower and upper
  triangular factors stored in A->spptr and performs the solve analysis on each of them.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors with both factors set

  Notes:
  On return loTriFactorPtrTranspose and upTriFactorPtrTranspose of the container point to the new
  structures. An error is raised if the container or either factor is missing.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor,*upTriFactor;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT = NULL;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT = NULL;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* transpose of the lower triangular factor */
  ierr = MatSeqAIJCUSPARSEBuildTriFactorTranspose_Private(A,cusparseTriFactors,loTriFactor,&loTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->loTriFactorPtrTranspose = loTriFactorT;

  /* transpose of the upper triangular factor */
  ierr = MatSeqAIJCUSPARSEBuildTriFactorTranspose_Private(A,cusparseTriFactors,upTriFactor,&upTriFactorT);CHKERRQ(ierr);
  cusparseTriFactors->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1178bda325fcSPaul Mullowney 
/* Unary functor, callable on host and device, that converts a PetscScalar to a PetscInt by casting its real part */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar s)
  {
    return static_cast<PetscInt>(PetscRealPart(s));
  }
};
1187a49f1ed0SStefano Zampini 
1188*1a2c6b5cSJunchao Zhang static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
1189bda325fcSPaul Mullowney {
1190aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1191a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
1192bda325fcSPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
1193bda325fcSPaul Mullowney   cusparseStatus_t             stat;
1194aa372e3fSPaul Mullowney   cusparseIndexBase_t          indexBase;
1195b06137fdSPaul Mullowney   cudaError_t                  err;
119685ba7357SStefano Zampini   PetscErrorCode               ierr;
1197b175d8bbSPaul Mullowney 
1198bda325fcSPaul Mullowney   PetscFunctionBegin;
1199*1a2c6b5cSJunchao Zhang   if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
1200a49f1ed0SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1201a49f1ed0SStefano Zampini   matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
1202a49f1ed0SStefano Zampini   if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
1203a49f1ed0SStefano Zampini   matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
1204*1a2c6b5cSJunchao Zhang   if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matTranspose struct");
1205*1a2c6b5cSJunchao Zhang   if (A->transupdated) PetscFunctionReturn(0);
120685ba7357SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1207a49f1ed0SStefano Zampini   if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
1208a49f1ed0SStefano Zampini     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
1209a49f1ed0SStefano Zampini   }
1210a49f1ed0SStefano Zampini   if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
1211aa372e3fSPaul Mullowney     matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
121257d48284SJunchao Zhang     stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
1213aa372e3fSPaul Mullowney     indexBase = cusparseGetMatIndexBase(matstruct->descr);
121457d48284SJunchao Zhang     stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
121557d48284SJunchao Zhang     stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
1216aa372e3fSPaul Mullowney 
1217b06137fdSPaul Mullowney     /* set alpha and beta */
1218afb2bd1cSJunchao Zhang     err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
12197656d835SStefano Zampini     err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
12207656d835SStefano Zampini     err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
1221afb2bd1cSJunchao Zhang     err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12227656d835SStefano Zampini     err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
12237656d835SStefano Zampini     err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
1224b06137fdSPaul Mullowney 
1225aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
1226aa372e3fSPaul Mullowney       CsrMatrix *matrixT = new CsrMatrix;
1227a49f1ed0SStefano Zampini       matstructT->mat = matrixT;
1228554b8892SKarl Rupp       matrixT->num_rows = A->cmap->n;
1229554b8892SKarl Rupp       matrixT->num_cols = A->rmap->n;
1230aa372e3fSPaul Mullowney       matrixT->num_entries = a->nz;
1231a8bd5306SMark Adams       matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
1232aa372e3fSPaul Mullowney       matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
1233aa372e3fSPaul Mullowney       matrixT->values = new THRUSTARRAY(a->nz);
1234a3fdcf43SKarl Rupp 
1235039c6fbaSStefano Zampini       if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
123681902715SJunchao Zhang       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);
1237afb2bd1cSJunchao Zhang 
1238afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1239afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&matstructT->matDescr,
1240afb2bd1cSJunchao Zhang                                matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
1241afb2bd1cSJunchao Zhang                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
1242afb2bd1cSJunchao Zhang                                matrixT->values->data().get(),
1243afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
1244afb2bd1cSJunchao Zhang                                indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
1245afb2bd1cSJunchao Zhang      #endif
1246aa372e3fSPaul Mullowney     } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
1247afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1248afb2bd1cSJunchao Zhang       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
1249afb2bd1cSJunchao Zhang    #else
1250aa372e3fSPaul Mullowney       CsrMatrix *temp  = new CsrMatrix;
125151c6d536SStefano Zampini       CsrMatrix *tempT = new CsrMatrix;
125251c6d536SStefano Zampini       /* First convert HYB to CSR */
1253aa372e3fSPaul Mullowney       temp->num_rows = A->rmap->n;
1254aa372e3fSPaul Mullowney       temp->num_cols = A->cmap->n;
1255aa372e3fSPaul Mullowney       temp->num_entries = a->nz;
1256aa372e3fSPaul Mullowney       temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1257aa372e3fSPaul Mullowney       temp->column_indices = new THRUSTINTARRAY32(a->nz);
1258aa372e3fSPaul Mullowney       temp->values = new THRUSTARRAY(a->nz);
1259aa372e3fSPaul Mullowney 
1260aa372e3fSPaul Mullowney       stat = cusparse_hyb2csr(cusparsestruct->handle,
1261aa372e3fSPaul Mullowney                               matstruct->descr, (cusparseHybMat_t)matstruct->mat,
1262aa372e3fSPaul Mullowney                               temp->values->data().get(),
1263aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
126457d48284SJunchao Zhang                               temp->column_indices->data().get());CHKERRCUSPARSE(stat);
1265aa372e3fSPaul Mullowney 
1266aa372e3fSPaul Mullowney       /* Next, convert CSR to CSC (i.e. the matrix transpose) */
1267aa372e3fSPaul Mullowney       tempT->num_rows = A->rmap->n;
1268aa372e3fSPaul Mullowney       tempT->num_cols = A->cmap->n;
1269aa372e3fSPaul Mullowney       tempT->num_entries = a->nz;
1270aa372e3fSPaul Mullowney       tempT->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
1271aa372e3fSPaul Mullowney       tempT->column_indices = new THRUSTINTARRAY32(a->nz);
1272aa372e3fSPaul Mullowney       tempT->values = new THRUSTARRAY(a->nz);
1273aa372e3fSPaul Mullowney 
1274aa372e3fSPaul Mullowney       stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
1275aa372e3fSPaul Mullowney                               temp->num_cols, temp->num_entries,
1276aa372e3fSPaul Mullowney                               temp->values->data().get(),
1277aa372e3fSPaul Mullowney                               temp->row_offsets->data().get(),
1278aa372e3fSPaul Mullowney                               temp->column_indices->data().get(),
1279aa372e3fSPaul Mullowney                               tempT->values->data().get(),
1280aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
1281aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
128257d48284SJunchao Zhang                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1283aa372e3fSPaul Mullowney 
1284aa372e3fSPaul Mullowney       /* Last, convert CSC to HYB */
1285aa372e3fSPaul Mullowney       cusparseHybMat_t hybMat;
128657d48284SJunchao Zhang       stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
1287aa372e3fSPaul Mullowney       cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
1288aa372e3fSPaul Mullowney         CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
1289aa372e3fSPaul Mullowney       stat = cusparse_csr2hyb(cusparsestruct->handle, A->rmap->n, A->cmap->n,
1290aa372e3fSPaul Mullowney                               matstructT->descr, tempT->values->data().get(),
1291aa372e3fSPaul Mullowney                               tempT->row_offsets->data().get(),
1292aa372e3fSPaul Mullowney                               tempT->column_indices->data().get(),
129357d48284SJunchao Zhang                               hybMat, 0, partition);CHKERRCUSPARSE(stat);
1294aa372e3fSPaul Mullowney 
1295aa372e3fSPaul Mullowney       /* assign the pointer */
1296aa372e3fSPaul Mullowney       matstructT->mat = hybMat;
1297*1a2c6b5cSJunchao Zhang       A->transupdated = PETSC_TRUE;
1298aa372e3fSPaul Mullowney       /* delete temporaries */
1299aa372e3fSPaul Mullowney       if (tempT) {
1300aa372e3fSPaul Mullowney         if (tempT->values) delete (THRUSTARRAY*) tempT->values;
1301aa372e3fSPaul Mullowney         if (tempT->column_indices) delete (THRUSTINTARRAY32*) tempT->column_indices;
1302aa372e3fSPaul Mullowney         if (tempT->row_offsets) delete (THRUSTINTARRAY32*) tempT->row_offsets;
1303aa372e3fSPaul Mullowney         delete (CsrMatrix*) tempT;
1304087f3262SPaul Mullowney       }
1305aa372e3fSPaul Mullowney       if (temp) {
1306aa372e3fSPaul Mullowney         if (temp->values) delete (THRUSTARRAY*) temp->values;
1307aa372e3fSPaul Mullowney         if (temp->column_indices) delete (THRUSTINTARRAY32*) temp->column_indices;
1308aa372e3fSPaul Mullowney         if (temp->row_offsets) delete (THRUSTINTARRAY32*) temp->row_offsets;
1309aa372e3fSPaul Mullowney         delete (CsrMatrix*) temp;
1310aa372e3fSPaul Mullowney       }
1311afb2bd1cSJunchao Zhang      #endif
1312aa372e3fSPaul Mullowney     }
1313a49f1ed0SStefano Zampini   }
1314a49f1ed0SStefano Zampini   if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
1315a49f1ed0SStefano Zampini     CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
1316a49f1ed0SStefano Zampini     CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
1317a49f1ed0SStefano Zampini     if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix");
1318a49f1ed0SStefano Zampini     if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix rows");
1319a49f1ed0SStefano Zampini     if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix cols");
1320a49f1ed0SStefano Zampini     if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values");
1321a49f1ed0SStefano Zampini     if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT");
1322a49f1ed0SStefano Zampini     if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT rows");
1323a49f1ed0SStefano Zampini     if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT cols");
1324a49f1ed0SStefano Zampini     if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT values");
1325a49f1ed0SStefano Zampini     if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
1326a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
1327a49f1ed0SStefano Zampini       cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
1328a49f1ed0SStefano Zampini       ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
1329a49f1ed0SStefano Zampini     }
1330a49f1ed0SStefano Zampini     if (!cusparsestruct->csr2csc_i) {
1331a49f1ed0SStefano Zampini       THRUSTARRAY csr2csc_a(matrix->num_entries);
1332a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));
1333a49f1ed0SStefano Zampini 
1334a49f1ed0SStefano Zampini       indexBase = cusparseGetMatIndexBase(matstruct->descr);
1335a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1336a49f1ed0SStefano Zampini       void   *csr2cscBuffer;
1337a49f1ed0SStefano Zampini       size_t csr2cscBufferSize;
1338a49f1ed0SStefano Zampini       stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
1339a49f1ed0SStefano Zampini                                            A->cmap->n, matrix->num_entries,
1340a49f1ed0SStefano Zampini                                            matrix->values->data().get(),
1341a49f1ed0SStefano Zampini                                            cusparsestruct->rowoffsets_gpu->data().get(),
1342a49f1ed0SStefano Zampini                                            matrix->column_indices->data().get(),
1343a49f1ed0SStefano Zampini                                            matrixT->values->data().get(),
1344a49f1ed0SStefano Zampini                                            matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1345a49f1ed0SStefano Zampini                                            CUSPARSE_ACTION_NUMERIC,indexBase,
1346a49f1ed0SStefano Zampini                                            cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
1347a49f1ed0SStefano Zampini       err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
1348a49f1ed0SStefano Zampini      #endif
1349a49f1ed0SStefano Zampini 
1350*1a2c6b5cSJunchao Zhang       if (matrix->num_entries) {
1351*1a2c6b5cSJunchao Zhang         /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
1352*1a2c6b5cSJunchao Zhang            mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
1353*1a2c6b5cSJunchao Zhang            I checked every parameters and they were just fine. I have no clue why cusparse complains.
1354*1a2c6b5cSJunchao Zhang 
1355*1a2c6b5cSJunchao Zhang            Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
1356*1a2c6b5cSJunchao Zhang            should be filled with indexBase. So I just take a shortcut here.
1357*1a2c6b5cSJunchao Zhang         */
1358*1a2c6b5cSJunchao Zhang         stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
1359*1a2c6b5cSJunchao Zhang                               A->cmap->n,matrix->num_entries,
1360*1a2c6b5cSJunchao Zhang                               csr2csc_a.data().get(),
1361*1a2c6b5cSJunchao Zhang                               cusparsestruct->rowoffsets_gpu->data().get(),
1362*1a2c6b5cSJunchao Zhang                               matrix->column_indices->data().get(),
1363a49f1ed0SStefano Zampini                               matrixT->values->data().get(),
1364a49f1ed0SStefano Zampini                              #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1365a49f1ed0SStefano Zampini                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
1366a49f1ed0SStefano Zampini                               CUSPARSE_ACTION_NUMERIC,indexBase,
1367*1a2c6b5cSJunchao Zhang                               cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
1368a49f1ed0SStefano Zampini                              #else
1369a49f1ed0SStefano Zampini                               matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
1370*1a2c6b5cSJunchao Zhang                               CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
1371a49f1ed0SStefano Zampini                              #endif
1372*1a2c6b5cSJunchao Zhang       } else {
1373*1a2c6b5cSJunchao Zhang         matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
1374*1a2c6b5cSJunchao Zhang       }
1375*1a2c6b5cSJunchao Zhang 
1376a49f1ed0SStefano Zampini       cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
1377a49f1ed0SStefano Zampini       PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
1378a49f1ed0SStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
1379a49f1ed0SStefano Zampini       err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
1380a49f1ed0SStefano Zampini      #endif
1381a49f1ed0SStefano Zampini     }
1382a49f1ed0SStefano Zampini     PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
1383a49f1ed0SStefano Zampini                                                      thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
1384a49f1ed0SStefano Zampini                                                      matrixT->values->begin()));
1385a49f1ed0SStefano Zampini   }
138685ba7357SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
1387213423ffSJunchao Zhang   /* the compressed row indices is not used for matTranspose */
1388213423ffSJunchao Zhang   matstructT->cprowIndices = NULL;
1389aa372e3fSPaul Mullowney   /* assign the pointer */
1390aa372e3fSPaul Mullowney   ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
1391*1a2c6b5cSJunchao Zhang   A->transupdated = PETSC_TRUE;
1392bda325fcSPaul Mullowney   PetscFunctionReturn(0);
1393bda325fcSPaul Mullowney }
1394bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE - transpose triangular solve on the GPU for factors that carry
  row and column permutations.

  Input Parameters:
+ A  - the factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Steps:
    1. gather bb through rpermIndices into xx
    2. solve with the transposed upper factor (xx -> work vector)
    3. solve with the transposed lower factor (work vector -> xx)
    4. gather xx through cpermIndices into the work vector, then copy the work vector back to xx

  The transposed factors are created on first use by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve()
  when neither of them is present yet.

  The Thrust calls go through PetscStackCallThrust like the other Thrust calls in this file.
  NOTE(review): presumably the macro turns Thrust exceptions into PETSc errors - confirm against its definition.

  Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE?
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    /* re-read the pointers: the analysis stores the freshly built factors in cusparseTriFactors */
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
                                    thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
                                    xGPU));

  /* Next, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
                                    thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
                                    tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1481bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - transpose triangular solve on the GPU without
  any permutation step.

  Input Parameters:
+ A  - the factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  bb is handed directly to the transposed upper factor, whose result lands in the work vector;
  the transposed lower factor then reads the work vector and writes xx.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT     = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT     = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  cusparseStatus_t                  cs;
  cudaError_t                       cuerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Build the transposed factors lazily the first time a transpose solve is requested */
  if (!(loT || upT)) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
    upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  }

  /* Device pointers of the solution and the right-hand side */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Stage 1: upper transposed factor, rhs -> work */
  cs = cusparse_solve(factors->handle, upT->solveOp,
                      upT->csrMat->num_rows,
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      upT->csrMat->num_entries,
                    #endif
                      &PETSC_CUSPARSE_ONE, upT->descr,
                      upT->csrMat->values->data().get(),
                      upT->csrMat->row_offsets->data().get(),
                      upT->csrMat->column_indices->data().get(),
                      upT->solveInfo,
                      rhs, work->data().get()
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      ,upT->solvePolicy, upT->solveBuffer
                    #endif
);CHKERRCUSPARSE(cs);

  /* Stage 2: lower transposed factor, work -> sol */
  cs = cusparse_solve(factors->handle, loT->solveOp,
                      loT->csrMat->num_rows,
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      loT->csrMat->num_entries,
                    #endif
                      &PETSC_CUSPARSE_ONE, loT->descr,
                      loT->csrMat->values->data().get(),
                      loT->csrMat->row_offsets->data().get(),
                      loT->csrMat->column_indices->data().get(),
                      loT->solveInfo,
                      work->data().get(), sol
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      ,loT->solvePolicy, loT->solveBuffer
                    #endif
);CHKERRCUSPARSE(cs);

  /* Hand the arrays back, wait for the device, then close the timing/flop logging */
  ierr  = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr  = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1549bda325fcSPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE - triangular solve on the GPU for factors that carry row and column
  permutations.

  Input Parameters:
+ A  - the factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Steps:
    1. gather bb through rpermIndices into the work vector
    2. solve with the lower factor (work vector -> xx)
    3. solve with the upper factor (xx -> work vector)
    4. gather the work vector through cpermIndices into xx

  The Thrust calls go through PetscStackCallThrust like the other Thrust calls in this file.
  NOTE(review): presumably the macro turns Thrust exceptions into PETSc errors - confirm against its definition.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
                                    thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
                                    tempGPU->begin()));

  /* Next, solve L */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
                        loTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactor->descr,
                        loTriFactor->csrMat->values->data().get(),
                        loTriFactor->csrMat->row_offsets->data().get(),
                        loTriFactor->csrMat->column_indices->data().get(),
                        loTriFactor->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve U */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
                        upTriFactor->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactor->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactor->descr,
                        upTriFactor->csrMat->values->data().get(),
                        upTriFactor->csrMat->row_offsets->data().get(),
                        upTriFactor->csrMat->column_indices->data().get(),
                        upTriFactor->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, reorder with the column permutation */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
                                    thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
                                    xGPU));

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
16249ae82921SPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - applies the two triangular factors stored in A->spptr
  to the right-hand side bb and stores the result in xx.

  Input Parameters:
+ A  - the factored matrix; A->spptr is read as a Mat_SeqAIJCUSPARSETriFactors
- bb - the right-hand side (accessed read-only on the GPU)

  Output Parameter:
. xx - the solution (accessed write-only on the GPU)

  Notes:
  No permutation of bb or xx is performed in this routine: the lower factor reads bb and writes
  the work vector of the factors, the upper factor reads that work vector and writes xx.
  The GPU time of the two solves is logged, together with 2*nnz - (number of local columns) flops.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  PetscScalar                       *mid;
  PetscErrorCode                    ierr;
  cusparseStatus_t                  cstat;
  cudaError_t                       cuerr;

  PetscFunctionBegin;
  /* Device pointers of the output and input vectors */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);
  /* Raw device pointer of the intermediate vector shared by both solves */
  mid  = work->data().get();

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);

  /* Step 1: lower triangular solve, bb -> work vector */
  cstat = cusparse_solve(factors->handle, lower->solveOp,
                         lower->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         lower->csrMat->num_entries,
                       #endif
                         &PETSC_CUSPARSE_ONE, lower->descr,
                         lower->csrMat->values->data().get(),
                         lower->csrMat->row_offsets->data().get(),
                         lower->csrMat->column_indices->data().get(),
                         lower->solveInfo,
                         rhs, mid
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         ,lower->solvePolicy, lower->solveBuffer
                       #endif
  );CHKERRCUSPARSE(cstat);

  /* Step 2: upper triangular solve, work vector -> xx */
  cstat = cusparse_solve(factors->handle, upper->solveOp,
                         upper->csrMat->num_rows,
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         upper->csrMat->num_entries,
                       #endif
                         &PETSC_CUSPARSE_ONE, upper->descr,
                         upper->csrMat->values->data().get(),
                         upper->csrMat->row_offsets->data().get(),
                         upper->csrMat->column_indices->data().get(),
                         upper->solveInfo,
                         mid, sol
                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                         ,upper->solvePolicy, upper->solveBuffer
                       #endif
  );CHKERRCUSPARSE(cstat);

  /* Hand the arrays back, then wait for the device before closing the timer */
  ierr  = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr  = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
16849ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSECopyFromGPU - copies the numerical values of the matrix from the GPU CSR
  storage back into the host array a->a.

  Input Parameter:
. A - the matrix; A->data is read as Mat_SeqAIJ and A->spptr as Mat_SeqAIJCUSPARSE

  Notes:
  Nothing is done unless A->offloadmask is PETSC_OFFLOAD_GPU. Only the values (a->nz scalars) are
  transferred; row offsets and column indices are not touched. On success the offload mask is set
  to PETSC_OFFLOAD_BOTH.

  The device storage is interpreted as a CsrMatrix, so an error is raised (instead of reading
  through an invalid pointer) when the GPU structure is missing or when the storage format is
  not MAT_CUSPARSE_CSR.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  cudaError_t        cerr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    CsrMatrix *matrix;

    /* validate before casting: for ELL/HYB storage mat->mat is not a CsrMatrix */
    if (!cusp || !cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
    if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Copy from GPU is only supported for MAT_CUSPARSE_CSR");
    matrix = (CsrMatrix*)cusp->mat->mat;
    if (!matrix || !matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values on the GPU");

    ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    cerr = cudaMemcpy(a->a, matrix->values->data().get(), a->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuToCpu(a->nz*sizeof(PetscScalar));CHKERRQ(ierr);
    ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
    /* host and device now hold the same values */
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
17057e8381f9SStefano Zampini 
/*
  MatSeqAIJGetArray_SeqAIJCUSPARSE - returns the host array of matrix values.

  Input Parameter:
. A - the matrix

  Output Parameter:
. array - set to the host values array (a->a of the Mat_SeqAIJ data)

  Notes:
  MatSeqAIJCUSPARSECopyFromGPU() is called first so the host values are refreshed if needed; the
  offload mask is then set to PETSC_OFFLOAD_CPU.
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* bring the host copy up to date before handing it out */
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  *array         = ((Mat_SeqAIJ*)A->data)->a;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  PetscFunctionReturn(0);
}
17177e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSECopyToGPU - makes the GPU representation of the matrix consistent with the
  host CSR data.

  Input Parameter:
. A - the matrix; A->data is read as Mat_SeqAIJ and A->spptr as Mat_SeqAIJCUSPARSE

  Notes:
  An error is raised if the matrix is bound to the CPU. Nothing is done unless A->offloadmask is
  PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.

  Two paths exist:
+ values only  - taken when the nonzero state recorded on the GPU matches A->nonzerostate, the
                 format is MAT_CUSPARSE_CSR and a GPU structure already exists; only a->a is uploaded
- full rebuild - the existing GPU structure, transpose, work vector and row offsets are destroyed
                 and a new Mat_SeqAIJCUSPARSEMultStruct is built (CSR, or ELL/HYB before CUDA-11)

  If the host has no values (a->a is NULL) only the structure is uploaded and the offload mask is
  left untouched; otherwise it is set to PETSC_OFFLOAD_BOTH.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    /* the values-only path needs an existing GPU structure; without one fall through to the full rebuild */
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR && cusparsestruct->mat) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* the values changed: the cached transpose (if any) must be refreshed, its structure is kept */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* throw away everything that depends on the old nonzero structure */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        /* pick the row description: compressed rows (with their row index map) or plain CSR */
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
        /* without host values only the structure is uploaded, so the matrix cannot be flagged as in sync */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        /* reject unsupported formats before anything is allocated, so that nothing is leaked on error */
        if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #endif

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants; the handle is switched to device pointer mode below */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build a hybrid/ellpack matrix if this option is chosen for the storage */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if !PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          /* stage the data as CSR on the device, convert it to HYB/ELL, then drop the staging copy */
          CsrMatrix *mat= new CsrMatrix;
          mat->num_rows = m;
          mat->num_cols = A->cmap->n;
          mat->num_entries = nnz;
          mat->row_offsets = new THRUSTINTARRAY32(m+1);
          mat->row_offsets->assign(ii, ii + m+1);

          mat->column_indices = new THRUSTINTARRAY32(nnz);
          mat->column_indices->assign(a->j, a->j+nnz);

          mat->values = new THRUSTARRAY(nnz);
          if (a->a) mat->values->assign(a->a, a->a+nnz);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        /* log what was actually uploaded: m+1 row offsets and nnz column indices (int), tmp compressed row
           indices (PetscInt), the 3 scalar constants and, only when host values exist, nnz values */
        ierr = PetscLogCpuToGpu(((m+1)+nnz)*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->a ? nnz : 0))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* remember which nonzero structure lives on the GPU so that later calls can copy values only */
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
18749ae82921SPaul Mullowney 
/* Functor for tuple-valued iterators: adds element 0 of the tuple to element 1, in place */
struct VecCUDAPlusEquals
{
  template <typename Pair>
  __host__ __device__
  void operator()(Pair entry)
  {
    /* entry<1> <- entry<1> + entry<0> */
    thrust::get<1>(entry) = thrust::get<1>(entry) + thrust::get<0>(entry);
  }
};
1884aa372e3fSPaul Mullowney 
/* Functor for tuple-valued iterators: copies element 0 of the tuple into element 1 */
struct VecCUDAEquals
{
  template <typename Pair>
  __host__ __device__
  void operator()(Pair entry)
  {
    /* entry<1> <- entry<0> */
    thrust::get<1>(entry) = thrust::get<0>(entry);
  }
};
18947e8381f9SStefano Zampini 
/* Functor for tuple-valued iterators: copies element 1 of the tuple into element 0
   (the opposite direction of VecCUDAEquals) */
struct VecCUDAEqualsReverse
{
  template <typename Pair>
  __host__ __device__
  void operator()(Pair entry)
  {
    /* entry<0> <- entry<1> */
    thrust::get<0>(entry) = thrust::get<1>(entry);
  }
};
1904e6e9a74fSStefano Zampini 
/*
  MatMatCusparse - data attached to a matrix product (C->product->data) by the CUSPARSE
  matrix-matrix product routines; released by MatDestroy_MatMatCusparse().
*/
struct MatMatCusparse {
  PetscBool             cisdense;
  PetscScalar           *Bt;        /* device buffer, released with cudaFree(); receives the transposed dense B when cusparse cannot transpose it */
  Mat                   X;          /* intermediate dense matrix, written instead of C for the RARt and PtAP products */
  PetscBool             reusesym;   /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;
  CsrMatrix             *Bcsr;      /* released with delete; only the CsrMatrix object itself is freed */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;
  PetscBool             initialized; /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;   /* dense descriptor for B */
  cusparseDnMatDescr_t  matCDescr;   /* dense descriptor for C or X */
  PetscInt              Blda,Clda;   /* Record leading dimensions of B and C here to detect changes */
  size_t                mmBufferSize; /* size in bytes of mmBuffer */
  void                  *mmBuffer;   /* device work buffer passed to cusparseSpMM() */
  void                  *mmBuffer2;  /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;
#endif
};
1924ccdfe979SStefano Zampini 
/*
  MatDestroy_MatMatCusparse - releases a MatMatCusparse object and everything it owns.

  Input Parameter:
. data - pointer to the MatMatCusparse to destroy (it is released with PetscFree() at the end)

  Notes:
  The device buffers are released with cudaFree(), the cusparse descriptors (CUDA >= 11 only) with
  their matching destroy routines, and the intermediate matrix X with MatDestroy().
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse*)data;
  PetscErrorCode   ierr;
  cudaError_t      cuerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t cstat;
 #endif

  PetscFunctionBegin;
  /* data available with every CUDA version */
  cuerr = cudaFree(mm->Bt);CHKERRCUDA(cuerr);
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* descriptors and work buffers of the generic cusparse API, each released only if it was created */
  if (mm->matSpBDescr) {
    cstat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(cstat);
  }
  if (mm->mmBuffer) {
    cuerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cuerr);
  }
  if (mm->mmBuffer2) {
    cuerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cuerr);
  }
  if (mm->matBDescr) {
    cstat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(cstat);
  }
  if (mm->matCDescr) {
    cstat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(cstat);
  }
  if (mm->spgemmDesc) {
    cstat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(cstat);
  }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  /* finally the container itself */
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1949ccdfe979SStefano Zampini 
1950ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1951ccdfe979SStefano Zampini 
1952ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1953ccdfe979SStefano Zampini {
1954ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
1955ccdfe979SStefano Zampini   Mat                          A,B;
1956afb2bd1cSJunchao Zhang   PetscInt                     m,n,blda,clda;
1957ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
1958ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
1959ccdfe979SStefano Zampini   cusparseStatus_t             stat;
1960ccdfe979SStefano Zampini   cusparseOperation_t          opA;
1961ccdfe979SStefano Zampini   const PetscScalar            *barray;
1962ccdfe979SStefano Zampini   PetscScalar                  *carray;
1963ccdfe979SStefano Zampini   PetscErrorCode               ierr;
1964ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
1965ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
1966ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
1967afb2bd1cSJunchao Zhang   cudaError_t                  cerr;
1968ccdfe979SStefano Zampini 
1969ccdfe979SStefano Zampini   PetscFunctionBegin;
1970ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1971ccdfe979SStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1972ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
1973ccdfe979SStefano Zampini   A    = product->A;
1974ccdfe979SStefano Zampini   B    = product->B;
1975ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1976ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
1977ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
1978ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
1979ccdfe979SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
1980ccdfe979SStefano Zampini   ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
1981ccdfe979SStefano Zampini   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
1982ccdfe979SStefano Zampini   switch (product->type) {
1983ccdfe979SStefano Zampini   case MATPRODUCT_AB:
1984ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
1985ccdfe979SStefano Zampini     mat = cusp->mat;
1986ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
1987ccdfe979SStefano Zampini     m   = A->rmap->n;
1988ccdfe979SStefano Zampini     n   = B->cmap->n;
1989ccdfe979SStefano Zampini     break;
1990ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
1991*1a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
1992e6e9a74fSStefano Zampini       mat = cusp->mat;
1993e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
1994e6e9a74fSStefano Zampini     } else {
1995*1a2c6b5cSJunchao Zhang       ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
1996ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
1997ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
1998e6e9a74fSStefano Zampini     }
1999ccdfe979SStefano Zampini     m = A->cmap->n;
2000ccdfe979SStefano Zampini     n = B->cmap->n;
2001ccdfe979SStefano Zampini     break;
2002ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2003ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2004ccdfe979SStefano Zampini     mat = cusp->mat;
2005ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2006ccdfe979SStefano Zampini     m   = A->rmap->n;
2007ccdfe979SStefano Zampini     n   = B->rmap->n;
2008ccdfe979SStefano Zampini     break;
2009ccdfe979SStefano Zampini   default:
2010ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2011ccdfe979SStefano Zampini   }
2012ccdfe979SStefano Zampini   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2013ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
2014ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
2015ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
2016afb2bd1cSJunchao Zhang   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
2017ccdfe979SStefano Zampini   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
2018afb2bd1cSJunchao Zhang 
2019ccdfe979SStefano Zampini   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
2020c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2021c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2022c8378d12SStefano Zampini     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
2023c8378d12SStefano Zampini   } else {
2024c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
2025c8378d12SStefano Zampini     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
2026c8378d12SStefano Zampini   }
2027c8378d12SStefano Zampini 
2028c8378d12SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2029afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2030afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2031fcdce8c4SStefano Zampini   /* (re)allcoate mmBuffer if not initialized or LDAs are different */
2032afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2033fcdce8c4SStefano Zampini     size_t mmBufferSize;
2034afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2035afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
2036afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2037afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2038afb2bd1cSJunchao Zhang     }
2039c8378d12SStefano Zampini 
2040afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2041afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2042afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2043afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2044afb2bd1cSJunchao Zhang     }
2045afb2bd1cSJunchao Zhang 
2046afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
2047afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&mat->matDescr,
2048afb2bd1cSJunchao Zhang                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2049afb2bd1cSJunchao Zhang                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2050afb2bd1cSJunchao Zhang                                csrmat->values->data().get(),
2051afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2052afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2053afb2bd1cSJunchao Zhang     }
2054afb2bd1cSJunchao Zhang     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2055afb2bd1cSJunchao Zhang                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2056afb2bd1cSJunchao Zhang                                    mmdata->matCDescr,cusparse_scalartype,
2057fcdce8c4SStefano Zampini                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2058fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2059fcdce8c4SStefano Zampini       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2060fcdce8c4SStefano Zampini       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2061fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2062fcdce8c4SStefano Zampini     }
2063afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2064afb2bd1cSJunchao Zhang   } else {
2065afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
2066afb2bd1cSJunchao Zhang     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2067afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2068afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2069afb2bd1cSJunchao Zhang   }
2070afb2bd1cSJunchao Zhang 
2071afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2072afb2bd1cSJunchao Zhang   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2073afb2bd1cSJunchao Zhang                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2074afb2bd1cSJunchao Zhang                       mmdata->matCDescr,cusparse_scalartype,
2075fcdce8c4SStefano Zampini                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2076afb2bd1cSJunchao Zhang  #else
2077afb2bd1cSJunchao Zhang   PetscInt k;
2078afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2079ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2080ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2081ccdfe979SStefano Zampini     cublasStatus_t cerr;
2082ccdfe979SStefano Zampini 
2083ccdfe979SStefano Zampini     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
2084ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2085ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
2086ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
2087ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
2088ccdfe979SStefano Zampini                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2089ccdfe979SStefano Zampini     blda = B->cmap->n;
2090afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2091afb2bd1cSJunchao Zhang   } else {
2092afb2bd1cSJunchao Zhang     k    = B->rmap->n;
2093ccdfe979SStefano Zampini   }
2094ccdfe979SStefano Zampini 
2095afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2096ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2097afb2bd1cSJunchao Zhang                            csrmat->num_entries,mat->alpha_one,mat->descr,
2098ccdfe979SStefano Zampini                            csrmat->values->data().get(),
2099ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
2100ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
2101ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2102ccdfe979SStefano Zampini                            carray,clda);CHKERRCUSPARSE(stat);
2103afb2bd1cSJunchao Zhang  #endif
2104afb2bd1cSJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2105c8378d12SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2106c8378d12SStefano Zampini   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
2107ccdfe979SStefano Zampini   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
2108ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2109ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2110ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2111ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2112ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2113ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2114ccdfe979SStefano Zampini   } else {
2115ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
2116ccdfe979SStefano Zampini   }
2117ccdfe979SStefano Zampini   if (mmdata->cisdense) {
2118ccdfe979SStefano Zampini     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
2119ccdfe979SStefano Zampini   }
2120ccdfe979SStefano Zampini   if (!biscuda) {
2121ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2122ccdfe979SStefano Zampini   }
2123ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2124ccdfe979SStefano Zampini }
2125ccdfe979SStefano Zampini 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - symbolic phase of the products
  AB, AtB, ABt, PtAP and RARt, where product->A is a MATSEQAIJCUSPARSE matrix stored in
  MAT_CUSPARSE_CSR format and product->B is the second operand.

  Input/Output Parameter:
. C - the product matrix. On successful return C has been sized, its type has been set to
      MATSEQDENSECUDA, a MatMatCusparse product-data struct is attached to C->product
      (released through MatDestroy_MatMatCusparse) and C->ops->productnumeric points to
      MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA.

  Errors (all PETSC_ERR_PLIB):
  - product data is already attached to C
  - A is not of type MATSEQAIJCUSPARSE
  - A is not stored in MAT_CUSPARSE_CSR format
  - the product type is not one of the five listed above
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           m,n;
  PetscBool          cisdense,flg;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A    = product->A;
  B    = product->B;
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
  if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");

  /* local sizes (m x n) of the result for each supported product type */
  switch (product->type) {
  case MATPRODUCT_AB:
    m = A->rmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_AtB:
    m = A->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_ABt:
    m = A->rmap->n;
    n = B->rmap->n;
    break;
  case MATPRODUCT_PtAP:
    m = B->cmap->n;
    n = B->cmap->n;
    break;
  case MATPRODUCT_RARt:
    m = B->rmap->n;
    n = B->rmap->n;
    break;
  default:
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
  /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU;
     the original type must be recorded before it is overwritten by MatSetType() */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&cisdense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* product data: hand ownership to C->product right away (as done in
     MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE), so that whatever has been allocated so far
     is reachable through the destroy callback if one of the calls below errors out */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  C->product->data    = mmdata;
  C->product->destroy = MatDestroy_MatMatCusparse;
  mmdata->cisdense    = cisdense;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
#endif
  /* for these products we need intermediate storage X:
     A->rmap->n x B->rmap->n for RARt, A->rmap->n x B->cmap->n for PtAP */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
    if (product->type == MATPRODUCT_RARt) {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->rmap->n,A->rmap->n,B->rmap->n);CHKERRQ(ierr);
    } else {
      ierr = MatSetSizes(mmdata->X,A->rmap->n,B->cmap->n,A->rmap->n,B->cmap->n);CHKERRQ(ierr);
    }
  }

  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2199ccdfe979SStefano Zampini 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - numeric phase of the sparse-sparse products
  AB, AtB and ABt between two MATSEQAIJCUSPARSE matrices.

  Input/Output Parameter:
. C - the product matrix; it must be of type MATSEQAIJCUSPARSE and carry the MatMatCusparse
      product data created by the symbolic phase

  Flow:
  - if mmdata->reusesym is set, the flag is cleared, the CSR structures of C are validated and
    no computation is performed
  - otherwise, if C has no nonzeros (c->nz == 0), nothing is computed
  - otherwise A and B are validated and copied to the GPU, and the values of C are computed with
    cusparseSpGEMM_compute/cusparseSpGEMM_copy (CUDA >= 11) or cusparse_csr_spgemm (older CUDA)
  In every case the routine ends with a reduced form of the MatAssemblyEnd_SeqAIJ bookkeeping.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product        *product = C->product;
  Mat_SeqAIJ         *cseq    = (Mat_SeqAIJ*)C->data;
  Mat                A,B;
  Mat_SeqAIJCUSPARSE *cuspC;
  MatMatCusparse     *mmdata;
  PetscBool          match;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&match);CHKERRQ(ierr);
  if (!match) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;
  A      = product->A;
  B      = product->B;

  if (mmdata->reusesym) {
    /* this happens when api_user is true, meaning that the matrix values have been already
       computed in the MatProductSymbolic phase: only sanity-check C and consume the flag */
    Mat_SeqAIJCUSPARSEMultStruct *multC;
    CsrMatrix                    *csrC;

    mmdata->reusesym = PETSC_FALSE;
    cuspC = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (cuspC->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    multC = cuspC->mat;
    if (!multC) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    csrC = (CsrMatrix*)multC->mat;
    if (!csrC) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
  } else if (cseq->nz) {
    /* C has nonzeros: recompute its values on the GPU */
    Mat_SeqAIJCUSPARSE           *cuspA,*cuspB;
    Mat_SeqAIJCUSPARSEMultStruct *multA,*multB,*multC;
    CsrMatrix                    *csrA,*csrB,*csrC;
    MatProductType               ptype;
    cusparseStatus_t             stat;
    cudaError_t                  cerr;

    ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&match);CHKERRQ(ierr);
    if (!match) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
    ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&match);CHKERRQ(ierr);
    if (!match) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
    if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
    if (B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");

    cuspA = (Mat_SeqAIJCUSPARSE*)A->spptr;
    cuspB = (Mat_SeqAIJCUSPARSE*)B->spptr;
    cuspC = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (cuspA->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    if (cuspB->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    if (cuspC->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

    /* a transposed symmetric operand is handled as a plain AB product */
    ptype = product->type;
    if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
    if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;

    /* select the mult structs to use: the transposed operand uses its matTranspose struct */
    if (ptype == MATPRODUCT_AB) {
      multA = cuspA->mat;
      multB = cuspB->mat;
    } else if (ptype == MATPRODUCT_AtB) {
      multA = cuspA->matTranspose;
      multB = cuspB->mat;
    } else if (ptype == MATPRODUCT_ABt) {
      multA = cuspA->mat;
      multB = cuspB->matTranspose;
    } else SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
    multC = cuspC->mat;
    if (!multA) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
    if (!multB) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
    if (!multC) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);

    csrA = (CsrMatrix*)multA->mat;
    /* B may be in compressed row storage: in that case use the CSR stored in the product data */
    if (mmdata->Bcsr) csrB = mmdata->Bcsr;
    else              csrB = (CsrMatrix*)multB->mat;
    csrC = (CsrMatrix*)multC->mat;
    if (!csrA) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
    if (!csrB) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
    if (!csrC) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    {
      /* sparse descriptor of B, matching the choice made for csrB above */
      cusparseSpMatDescr_t spB;

      if (mmdata->Bcsr) spB = mmdata->matSpBDescr;
      else              spB = multB->matDescr;
      /* both calls reuse the SpGEMM descriptor and work buffer kept in the product data */
      stat = cusparseSpGEMM_compute(cuspC->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                    multC->alpha_one, multA->matDescr, spB, multC->beta_zero, multC->matDescr,
                                    cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
                                    mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
      stat = cusparseSpGEMM_copy(cuspC->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                                 multC->alpha_one, multA->matDescr, spB, multC->beta_zero, multC->matDescr,
                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
    }
#else
    stat = cusparse_csr_spgemm(cuspC->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
                               csrA->num_rows, csrB->num_cols, csrA->num_cols,
                               multA->descr, csrA->num_entries, csrA->values->data().get(), csrA->row_offsets->data().get(), csrA->column_indices->data().get(),
                               multB->descr, csrB->num_entries, csrB->values->data().get(), csrB->row_offsets->data().get(), csrB->column_indices->data().get(),
                               multC->descr, csrC->values->data().get(), csrC->row_offsets->data().get(), csrC->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
    ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    C->offloadmask = PETSC_OFFLOAD_GPU;
  }

  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,cseq->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",cseq->rmax);CHKERRQ(ierr);
  cseq->reallocs      = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->was_assembled    = PETSC_TRUE;
  C->assembled        = PETSC_TRUE;
  C->num_ass         += 1;
  PetscFunctionReturn(0);
}
2314fcdce8c4SStefano Zampini 
2315fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2316fcdce8c4SStefano Zampini {
2317fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2318fcdce8c4SStefano Zampini   Mat                          A,B;
2319fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2320fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a,*b,*c;
2321fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2322fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2323fcdce8c4SStefano Zampini   PetscInt                     i,j,m,n,k;
2324fcdce8c4SStefano Zampini   PetscBool                    flg;
2325fcdce8c4SStefano Zampini   PetscErrorCode               ierr;
2326fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2327fcdce8c4SStefano Zampini   cudaError_t                  cerr;
2328fcdce8c4SStefano Zampini   MatProductType               ptype;
2329fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2330fcdce8c4SStefano Zampini   PetscLogDouble               flops;
2331fcdce8c4SStefano Zampini   PetscBool                    biscompressed,ciscompressed;
2332fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2333fcdce8c4SStefano Zampini   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2334fcdce8c4SStefano Zampini   size_t                       bufSize2;
2335fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2336fcdce8c4SStefano Zampini #else
2337fcdce8c4SStefano Zampini   int                          cnz;
2338fcdce8c4SStefano Zampini #endif
2339fcdce8c4SStefano Zampini 
2340fcdce8c4SStefano Zampini   PetscFunctionBegin;
2341fcdce8c4SStefano Zampini   MatCheckProduct(C,1);
2342fcdce8c4SStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2343fcdce8c4SStefano Zampini   A    = product->A;
2344fcdce8c4SStefano Zampini   B    = product->B;
2345fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2346fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2347fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2348fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2349fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ*)A->data;
2350fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ*)B->data;
2351fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2352fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2353fcdce8c4SStefano Zampini   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2354fcdce8c4SStefano Zampini   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2355fcdce8c4SStefano Zampini 
2356fcdce8c4SStefano Zampini   /* product data */
2357fcdce8c4SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2358fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2359fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2360fcdce8c4SStefano Zampini 
2361fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2362fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2363fcdce8c4SStefano Zampini   ptype = product->type;
2364fcdce8c4SStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2365fcdce8c4SStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2366fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2367fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2368fcdce8c4SStefano Zampini   switch (ptype) {
2369fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2370fcdce8c4SStefano Zampini     m = A->rmap->n;
2371fcdce8c4SStefano Zampini     n = B->cmap->n;
2372fcdce8c4SStefano Zampini     k = A->cmap->n;
2373fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2374fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2375fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2376fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2377fcdce8c4SStefano Zampini     break;
2378fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2379fcdce8c4SStefano Zampini     m = A->cmap->n;
2380fcdce8c4SStefano Zampini     n = B->cmap->n;
2381fcdce8c4SStefano Zampini     k = A->rmap->n;
2382*1a2c6b5cSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
2383fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2384fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2385fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2386fcdce8c4SStefano Zampini     break;
2387fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2388fcdce8c4SStefano Zampini     m = A->rmap->n;
2389fcdce8c4SStefano Zampini     n = B->rmap->n;
2390fcdce8c4SStefano Zampini     k = A->cmap->n;
2391*1a2c6b5cSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
2392fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2393fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2394fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2395fcdce8c4SStefano Zampini     break;
2396fcdce8c4SStefano Zampini   default:
2397fcdce8c4SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2398fcdce8c4SStefano Zampini   }
2399fcdce8c4SStefano Zampini 
2400fcdce8c4SStefano Zampini   /* create cusparse matrix */
2401fcdce8c4SStefano Zampini   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2402fcdce8c4SStefano Zampini   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2403fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ*)C->data;
2404fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2405fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2406fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2407fcdce8c4SStefano Zampini 
2408fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2409fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2410fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
2411fcdce8c4SStefano Zampini     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2412fcdce8c4SStefano Zampini     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2413fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2414fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2415fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2416fcdce8c4SStefano Zampini   } else {
2417fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2418fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2419fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2420fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2421fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2422fcdce8c4SStefano Zampini   }
2423fcdce8c4SStefano Zampini   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2424fcdce8c4SStefano Zampini   Ccusp->mat      = Cmat;
2425fcdce8c4SStefano Zampini   Ccusp->mat->mat = Ccsr;
2426fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2427fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2428fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2429fcdce8c4SStefano Zampini   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2430fcdce8c4SStefano Zampini   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2431fcdce8c4SStefano Zampini   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2432fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2433fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2434fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2435fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2436fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2437fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2438fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2439fcdce8c4SStefano Zampini     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2440fcdce8c4SStefano Zampini     c->nz = 0;
2441fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2442fcdce8c4SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
2443fcdce8c4SStefano Zampini     goto finalizesym;
2444fcdce8c4SStefano Zampini   }
2445fcdce8c4SStefano Zampini 
2446fcdce8c4SStefano Zampini   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2447fcdce8c4SStefano Zampini   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2448fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2449fcdce8c4SStefano Zampini   if (!biscompressed) {
2450fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix*)Bmat->mat;
2451fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2452fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2453fcdce8c4SStefano Zampini #endif
2454fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2455fcdce8c4SStefano Zampini     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2456fcdce8c4SStefano Zampini     Bcsr = new CsrMatrix;
2457fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2458fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2459fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2460fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2461fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2462fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2463fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2464fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2465fcdce8c4SStefano Zampini       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2466fcdce8c4SStefano Zampini     }
2467fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2468fcdce8c4SStefano Zampini     mmdata->Bcsr = Bcsr;
2469fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2470fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
2471fcdce8c4SStefano Zampini       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2472fcdce8c4SStefano Zampini                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2473fcdce8c4SStefano Zampini                                Bcsr->values->data().get(),
2474fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2475fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2476fcdce8c4SStefano Zampini     }
2477fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2478fcdce8c4SStefano Zampini #endif
2479fcdce8c4SStefano Zampini   }
2480fcdce8c4SStefano Zampini   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2481fcdce8c4SStefano Zampini   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2482fcdce8c4SStefano Zampini   /* precompute flops count */
2483fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2484fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2485fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2486fcdce8c4SStefano Zampini       const PetscInt en = a->i[i+1];
2487fcdce8c4SStefano Zampini       for (j=st; j<en; j++) {
2488fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2489fcdce8c4SStefano Zampini         flops += 2.*(b->i[brow+1] - b->i[brow]);
2490fcdce8c4SStefano Zampini       }
2491fcdce8c4SStefano Zampini     }
2492fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2493fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2494fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i+1] - a->i[i];
2495fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i+1] - b->i[i];
2496fcdce8c4SStefano Zampini       flops += (2.*anzi)*bnzi;
2497fcdce8c4SStefano Zampini     }
2498fcdce8c4SStefano Zampini   } else { /* TODO */
2499fcdce8c4SStefano Zampini     flops = 0.;
2500fcdce8c4SStefano Zampini   }
2501fcdce8c4SStefano Zampini 
2502fcdce8c4SStefano Zampini   mmdata->flops = flops;
2503fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2504fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2505fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2506fcdce8c4SStefano Zampini   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2507fcdce8c4SStefano Zampini                            NULL, NULL, NULL,
2508fcdce8c4SStefano Zampini                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2509fcdce8c4SStefano Zampini                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2510fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2511fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
2512fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2513fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2514fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2515fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2516bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2517fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
2518fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2519fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2520fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2521fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2522fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
2523fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2524fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2525fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2526fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2527fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
2528fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2529fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2530fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2531fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
2532bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2533fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
2534fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2535fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2536fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2537fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2538fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
2539fcdce8c4SStefano Zampini   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2540fcdce8c4SStefano Zampini   c->nz = (PetscInt) C_nnz1;
254100702c57SStefano Zampini   ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
2542fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2543fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2544fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2545fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2546fcdce8c4SStefano Zampini   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2547fcdce8c4SStefano Zampini                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2548fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2549fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2550fcdce8c4SStefano Zampini                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2551fcdce8c4SStefano Zampini #else
2552fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2553fcdce8c4SStefano Zampini   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2554fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2555fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2556fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2557fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2558fcdce8c4SStefano Zampini   c->nz = cnz;
2559fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2560fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2561fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2562fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2563fcdce8c4SStefano Zampini 
2564fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2565fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2566fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2567fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2568fcdce8c4SStefano Zampini   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2569fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2570fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2571fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2572fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2573fcdce8c4SStefano Zampini #endif
2574fcdce8c4SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2575fcdce8c4SStefano Zampini   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2576fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2577fcdce8c4SStefano Zampini finalizesym:
2578fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
2579fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
2580fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
2581fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2582fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2583fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2584fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2585fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2586fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2587fcdce8c4SStefano Zampini     ii   = *Ccsr->row_offsets;
2588fcdce8c4SStefano Zampini     jj   = *Ccsr->column_indices;
2589fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2590fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2591fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2592fcdce8c4SStefano Zampini   } else {
2593fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2594fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2595fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2596fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2597fcdce8c4SStefano Zampini   }
2598fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
2599fcdce8c4SStefano Zampini     PetscInt r = 0;
2600fcdce8c4SStefano Zampini     c->i[0] = 0;
2601fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
2602fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
2603fcdce8c4SStefano Zampini       const PetscInt old = c->compressedrow.i[k];
2604fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r+1] = old;
2605fcdce8c4SStefano Zampini     }
2606fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2607fcdce8c4SStefano Zampini   }
2608fcdce8c4SStefano Zampini   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2609fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2610fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2611fcdce8c4SStefano Zampini   c->maxnz = c->nz;
2612fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
2613fcdce8c4SStefano Zampini   c->rmax = 0;
2614fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
2615fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
2616fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
2617fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
2618fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
2619fcdce8c4SStefano Zampini   }
2620fcdce8c4SStefano Zampini   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2621fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2622fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
2623fcdce8c4SStefano Zampini 
2624fcdce8c4SStefano Zampini   C->nonzerostate++;
2625fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2626fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2627fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
2628fcdce8c4SStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2629fcdce8c4SStefano Zampini   C->preallocated  = PETSC_TRUE;
2630fcdce8c4SStefano Zampini   C->assembled     = PETSC_FALSE;
2631fcdce8c4SStefano Zampini   C->was_assembled = PETSC_FALSE;
2632abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2633fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
2634fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
2635fcdce8c4SStefano Zampini   }
2636fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2637fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
2638fcdce8c4SStefano Zampini }
2639fcdce8c4SStefano Zampini 
PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);

/*
  Picks the symbolic-phase routine of a matrix product whose first operand is a SeqAIJCUSPARSE matrix.

  Input Parameter:
. mat - the product matrix; mat->product supplies the operands A, B (and C for ABC) and the product type

  Three situations are distinguished, in this order:
  - B is (derived from) MATSEQDENSE: the sparse-dense routines are selected;
  - B (and C, when the product is ABC) are MATSEQAIJCUSPARSE and none of the operands
    inspected below is bound to the CPU: the sparse-sparse GPU routines are selected;
  - anything else: the decision is delegated to MatProductSetFromOptions_SeqAIJ().

  Product types not listed in a given situation leave mat->ops->productsymbolic untouched.
*/
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *prod = mat->product;
  PetscBool      Bdense = PETSC_FALSE,Bcusp = PETSC_FALSE,Ccusp = PETSC_TRUE;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)prod->B,MATSEQDENSE,&Bdense);CHKERRQ(ierr);
  /* B only counts as a CUSPARSE operand when neither A nor B is bound to the CPU */
  if (!(prod->A->boundtocpu || prod->B->boundtocpu)) {
    ierr = PetscObjectTypeCompare((PetscObject)prod->B,MATSEQAIJCUSPARSE,&Bcusp);CHKERRQ(ierr);
  }
  /* the third operand is inspected for ABC only; for every other type Ccusp keeps its PETSC_TRUE default */
  if (prod->type == MATPRODUCT_ABC) {
    if (prod->C->boundtocpu) {
      Ccusp = PETSC_FALSE;
    } else {
      ierr = PetscObjectTypeCompare((PetscObject)prod->C,MATSEQAIJCUSPARSE,&Ccusp);CHKERRQ(ierr);
    }
  }

  /* sparse * dense */
  if (Bdense) {
    const MatProductType ptype = prod->type;

    if (ptype == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    } else if (ptype == MATPRODUCT_AB || ptype == MATPRODUCT_AtB || ptype == MATPRODUCT_ABt || ptype == MATPRODUCT_PtAP || ptype == MATPRODUCT_RARt) {
      if (!prod->A->boundtocpu) {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      } else { /* A lives on the CPU: use the host sparse-dense setup */
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      }
    }
    PetscFunctionReturn(0);
  }

  /* fallback for AIJ */
  if (!Bcusp || !Ccusp) {
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* sparse * sparse, everything on the GPU */
  switch (prod->type) {
  case MATPRODUCT_AB:
  case MATPRODUCT_AtB:
  case MATPRODUCT_ABt:
    mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    break;
  case MATPRODUCT_PtAP:
  case MATPRODUCT_RARt:
  case MATPRODUCT_ABC:
    /* triple products use the generic ABC routine */
    mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    break;
  default:
    break;
  }
  PetscFunctionReturn(0);
}
2700ccdfe979SStefano Zampini 
/*
  yy = A xx

  Forwards to MatMultAddKernel_SeqAIJCUSPARSE() with no vector to add (NULL)
  and with both the transpose and the Hermitian flags switched off.
*/
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2709e6e9a74fSStefano Zampini 
/*
  zz = A xx + yy

  Forwards to MatMultAddKernel_SeqAIJCUSPARSE() with yy as the vector to add
  and with both the transpose and the Hermitian flags switched off.
*/
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2718e6e9a74fSStefano Zampini 
/*
  yy = A^H xx

  Forwards to MatMultAddKernel_SeqAIJCUSPARSE() with no vector to add (NULL)
  and with both the transpose and the Hermitian flags switched on.
*/
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2727e6e9a74fSStefano Zampini 
/*
  zz = A^H xx + yy

  Forwards to MatMultAddKernel_SeqAIJCUSPARSE() with yy as the vector to add
  and with both the transpose and the Hermitian flags switched on.
*/
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
27369ae82921SPaul Mullowney 
/*
  yy = A^T xx

  Forwards to MatMultAddKernel_SeqAIJCUSPARSE() with no vector to add (NULL),
  the transpose flag switched on and the Hermitian flag switched off.
*/
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2745ca45077fSPaul Mullowney 
/*
  ScatterAdd - y[idx[i]] += x[i] for 0 <= i < n, one entry per thread

  Input Parameters:
+ n   - number of entries to scatter
. idx - destination index in y of each entry of x
- x   - values to add

  Output Parameter:
. y   - vector updated in place

  Launch with a 1D grid of 1D blocks providing at least n threads; surplus threads do nothing.
  The update is a plain (non-atomic) read-modify-write, so the entries of idx are presumably
  distinct -- confirm against the caller.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  /* Form the global index in PetscInt width. With 64-bit indices the former 32-bit product
     blockIdx.x*blockDim.x could wrap (or turn negative once stored in an int) and still pass
     the bound check below, leading to out-of-range accesses. */
  const PetscInt i = (PetscInt)blockIdx.x*blockDim.x + threadIdx.x;

  if (i < n) y[idx[i]] += x[i];
}
2751a0e72f99SJunchao Zhang 
2752afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2753e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
27549ae82921SPaul Mullowney {
27559ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2756aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
27579ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2758e6e9a74fSStefano Zampini   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2759b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
276057d48284SJunchao Zhang   cudaError_t                  cerr;
2761aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
2762e6e9a74fSStefano Zampini   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2763e6e9a74fSStefano Zampini   PetscBool                    compressed;
2764afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2765afb2bd1cSJunchao Zhang   PetscInt                     nx,ny;
2766afb2bd1cSJunchao Zhang #endif
27676e111a19SKarl Rupp 
27689ae82921SPaul Mullowney   PetscFunctionBegin;
2769e6e9a74fSStefano Zampini   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2770e6e9a74fSStefano Zampini   if (!a->nonzerorowcnt) {
2771afb2bd1cSJunchao Zhang     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2772d38a13f6SStefano Zampini     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2773e6e9a74fSStefano Zampini     PetscFunctionReturn(0);
2774e6e9a74fSStefano Zampini   }
277534d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
277634d6c7a5SJose E. Roman   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2777e6e9a74fSStefano Zampini   if (!trans) {
27789ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2779c9567895SMark     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2780e6e9a74fSStefano Zampini   } else {
2781*1a2c6b5cSJunchao Zhang     if (herm || !A->form_explicit_transpose) {
2782e6e9a74fSStefano Zampini       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2783e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2784e6e9a74fSStefano Zampini     } else {
2785*1a2c6b5cSJunchao Zhang       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);}
2786e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2787e6e9a74fSStefano Zampini     }
2788e6e9a74fSStefano Zampini   }
2789e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2790e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2791213423ffSJunchao Zhang 
2792e6e9a74fSStefano Zampini   try {
2793e6e9a74fSStefano Zampini     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2794213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2795213423ffSJunchao Zhang     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2796afb2bd1cSJunchao Zhang 
279785ba7357SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2798e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2799afb2bd1cSJunchao Zhang       /* z = A x + beta y.
2800afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2801afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2802afb2bd1cSJunchao Zhang       */
2803e6e9a74fSStefano Zampini       xptr = xarray;
2804afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2805213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2806afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2807afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2808afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
2809afb2bd1cSJunchao Zhang        */
2810afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2811afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2812afb2bd1cSJunchao Zhang         nx = mat->num_cols;
2813afb2bd1cSJunchao Zhang         ny = mat->num_rows;
2814afb2bd1cSJunchao Zhang       }
2815afb2bd1cSJunchao Zhang      #endif
2816e6e9a74fSStefano Zampini     } else {
2817afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
2818afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2819afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2820afb2bd1cSJunchao Zhang        */
2821afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2822e6e9a74fSStefano Zampini       dptr = zarray;
2823e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2824afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
2825e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2826a0e72f99SJunchao Zhang         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2827e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2828e6e9a74fSStefano Zampini                          VecCUDAEqualsReverse());
2829e6e9a74fSStefano Zampini       }
2830afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2831afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2832afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2833afb2bd1cSJunchao Zhang         nx = mat->num_rows;
2834afb2bd1cSJunchao Zhang         ny = mat->num_cols;
2835afb2bd1cSJunchao Zhang       }
2836afb2bd1cSJunchao Zhang      #endif
2837e6e9a74fSStefano Zampini     }
28389ae82921SPaul Mullowney 
2839afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
2840aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2841afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2842afb2bd1cSJunchao Zhang       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2843afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2844afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2845afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2846afb2bd1cSJunchao Zhang         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2847afb2bd1cSJunchao Zhang                                 matstruct->matDescr,
2848afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2849afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecYDescr,
2850afb2bd1cSJunchao Zhang                                 cusparse_scalartype,
2851afb2bd1cSJunchao Zhang                                 cusparsestruct->spmvAlg,
2852afb2bd1cSJunchao Zhang                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2853afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2854afb2bd1cSJunchao Zhang 
2855afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2856afb2bd1cSJunchao Zhang       } else {
2857afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2858afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2859afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2860afb2bd1cSJunchao Zhang       }
2861afb2bd1cSJunchao Zhang 
2862afb2bd1cSJunchao Zhang       stat = cusparseSpMV(cusparsestruct->handle, opA,
2863afb2bd1cSJunchao Zhang                                matstruct->alpha_one,
2864*1a2c6b5cSJunchao Zhang                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
2865afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecXDescr,
2866afb2bd1cSJunchao Zhang                                beta,
2867afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecYDescr,
2868afb2bd1cSJunchao Zhang                                cusparse_scalartype,
2869afb2bd1cSJunchao Zhang                                cusparsestruct->spmvAlg,
2870afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2871afb2bd1cSJunchao Zhang      #else
28727656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2873e6e9a74fSStefano Zampini       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2874a65300a6SPaul Mullowney                                mat->num_rows, mat->num_cols,
2875afb2bd1cSJunchao Zhang                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2876aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
2877e6e9a74fSStefano Zampini                                mat->column_indices->data().get(), xptr, beta,
287857d48284SJunchao Zhang                                dptr);CHKERRCUSPARSE(stat);
2879afb2bd1cSJunchao Zhang      #endif
2880aa372e3fSPaul Mullowney     } else {
2881213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
2882afb2bd1cSJunchao Zhang        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2883afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2884afb2bd1cSJunchao Zhang        #else
2885301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2886e6e9a74fSStefano Zampini         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2887afb2bd1cSJunchao Zhang                                  matstruct->alpha_one, matstruct->descr, hybMat,
2888e6e9a74fSStefano Zampini                                  xptr, beta,
288957d48284SJunchao Zhang                                  dptr);CHKERRCUSPARSE(stat);
2890afb2bd1cSJunchao Zhang        #endif
2891a65300a6SPaul Mullowney       }
2892aa372e3fSPaul Mullowney     }
289305035670SJunchao Zhang     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2894958c4211Shannah_mairs     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2895aa372e3fSPaul Mullowney 
2896e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2897213423ffSJunchao Zhang       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2898213423ffSJunchao Zhang         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2899213423ffSJunchao Zhang           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2900e6e9a74fSStefano Zampini         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2901213423ffSJunchao Zhang           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
29027656d835SStefano Zampini         }
2903213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2904c1fb3f03SStefano Zampini         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
29057656d835SStefano Zampini       }
29067656d835SStefano Zampini 
2907213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2908213423ffSJunchao Zhang       if (compressed) {
2909e6e9a74fSStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2910a0e72f99SJunchao Zhang         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
2911a0e72f99SJunchao Zhang            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
2912a0e72f99SJunchao Zhang            prevent that. So I just add a ScatterAdd kernel.
2913a0e72f99SJunchao Zhang          */
2914a0e72f99SJunchao Zhang        #if 0
2915a0e72f99SJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2916a0e72f99SJunchao Zhang         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
2917a0e72f99SJunchao Zhang                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2918e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2919c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
2920a0e72f99SJunchao Zhang        #else
2921a0e72f99SJunchao Zhang         PetscInt n = matstruct->cprowIndices->size();
2922a0e72f99SJunchao Zhang         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
2923a0e72f99SJunchao Zhang        #endif
292405035670SJunchao Zhang         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2925958c4211Shannah_mairs         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2926e6e9a74fSStefano Zampini       }
2927e6e9a74fSStefano Zampini     } else {
2928e6e9a74fSStefano Zampini       if (yy && yy != zz) {
2929e6e9a74fSStefano Zampini         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2930e6e9a74fSStefano Zampini       }
2931e6e9a74fSStefano Zampini     }
2932e6e9a74fSStefano Zampini     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2933213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2934213423ffSJunchao Zhang     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
29359ae82921SPaul Mullowney   } catch(char *ex) {
29369ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
29379ae82921SPaul Mullowney   }
2938e6e9a74fSStefano Zampini   if (yy) {
2939958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2940e6e9a74fSStefano Zampini   } else {
2941e6e9a74fSStefano Zampini     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2942e6e9a74fSStefano Zampini   }
29439ae82921SPaul Mullowney   PetscFunctionReturn(0);
29449ae82921SPaul Mullowney }
29459ae82921SPaul Mullowney 
/*
  MatMultTransposeAdd_SeqAIJCUSPARSE - Thin entry point that forwards to
  MatMultAddKernel_SeqAIJCUSPARSE() with the trailing flags (PETSC_TRUE, PETSC_FALSE).

  NOTE(review): the two flags presumably select "transpose" and "Hermitian" respectively,
  i.e. zz = A^T*xx + yy; confirm against the kernel's signature, which is defined elsewhere.
*/
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* all the work happens in the shared mult-add kernel */
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2954ca45077fSPaul Mullowney 
/*
  MatAssemblyEnd_SeqAIJCUSPARSE - Finishes assembly through the host SeqAIJ routine and,
  for a final assembly of an unfactored matrix that is not bound to the CPU and that
  owns a device-side matrix structure, marks the GPU copy as the valid one.
*/
static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *deviceMat;

  PetscFunctionBegin;
  /* only unfactored matrices keep a Mat_SeqAIJCUSPARSE in spptr; read the pointer before the host assembly runs */
  deviceMat = (A->factortype == MAT_FACTOR_NONE) ? ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat : NULL;

  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?

  /* flush assemblies and CPU-bound matrices never touch the offload mask */
  if (mode != MAT_FLUSH_ASSEMBLY && !A->boundtocpu && deviceMat) {
    A->offloadmask = PETSC_OFFLOAD_GPU;
  }
  PetscFunctionReturn(0);
}
29719ae82921SPaul Mullowney 
29729ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
/*@
   MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
{
  PetscErrorCode ierr;
  Mat            B;

  PetscFunctionBegin;
  /* the new matrix is published through *A as soon as it exists */
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  B    = *A;
  /* local and global sizes are both set to m x n */
  ierr = MatSetSizes(B,m,n,m,n);CHKERRQ(ierr);
  ierr = MatSetType(B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* preallocation is done by the host SeqAIJ routine */
  ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,nz,(PetscInt*)nnz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
30319ae82921SPaul Mullowney 
/*
  MatDestroy_SeqAIJCUSPARSE - Releases the GPU-side data hanging off A->spptr (the
  cuSPARSE matrix data for an unfactored matrix, the triangular-factor data otherwise),
  frees the optional device-side split-CSR structure, detaches the composed methods
  listed below, and finally runs the host SeqAIJ destructor.
*/
static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
{
  /* composed methods to detach, in the order they are removed */
  static const char *const composed[] = {
    "MatSeqAIJCopySubArray_C",
    "MatCUSPARSESetFormat_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",
    "MatProductSetFromOptions_seqaijcusparse_seqdense_C",
    "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",
    "MatFactorGetSolverType_C",
    "MatSetPreallocationCOO_C",
    "MatSetValuesCOO_C"
  };
  PetscErrorCode             ierr;
  PetscSplitCSRDataStructure *deviceMat = NULL;
  size_t                     k;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) {
    ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
  } else {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    /* take the device matrix out of the struct before the struct is destroyed; it is freed below */
    deviceMat       = cusp->deviceMat;
    cusp->deviceMat = NULL;
    ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
  }

  if (deviceMat) {
    Mat_SeqAIJ                 *aij = (Mat_SeqAIJ*)A->data;
    cudaError_t                cerr;
    PetscSplitCSRDataStructure hostCopy;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* bring the struct to the host so the device pointer stored inside it can be read */
    cerr = cudaMemcpy(&hostCopy,deviceMat,sizeof(PetscSplitCSRDataStructure),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    /* diag.i is released only when the host matrix uses compressed rows */
    if (aij->compressedrow.use) {
      cerr = cudaFree(hostCopy.diag.i);CHKERRCUDA(cerr);
    }
    cerr = cudaFree(deviceMat);CHKERRCUDA(cerr);
  }

  for (k=0; k<sizeof(composed)/sizeof(composed[0]); k++) {
    ierr = PetscObjectComposeFunction((PetscObject)A,composed[k],NULL);CHKERRQ(ierr);
  }
  ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
30679ae82921SPaul Mullowney 
3068ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
306995639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
/*
  MatDuplicate_SeqAIJCUSPARSE - Duplicates A with the host SeqAIJ routine and then
  converts the duplicate, in place, to MATSEQAIJCUSPARSE.
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* step 1: host-side duplicate; *B is produced by MatDuplicate_SeqAIJ() */
  ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
  /* step 2: MAT_INPLACE_MATRIX conversion of that same matrix */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
30799ff858a8SKarl Rupp 
/*
  MatAXPY_SeqAIJCUSPARSE - Computes Y = a*X + Y.

  If X's axpy operation differs from Y's, the host routine MatAXPY_SeqAIJ() is used
  directly. Otherwise both matrices are copied to the GPU (only MAT_CUSPARSE_CSR storage
  is accepted) and the work is dispatched on the nonzero-pattern relation str, which is
  first upgraded to SAME_NONZERO_PATTERN when the two CSR structures compare equal:

    SAME_NONZERO_PATTERN   - one cuBLAS axpy over the value arrays
    SUBSET_NONZERO_PATTERN - cuSPARSE spgeam, writing the result into Y's existing CSR arrays
    anything else          - host routine MatAXPY_SeqAIJ()
*/
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_SeqAIJ         *xaij = (Mat_SeqAIJ*)X->data;
  Mat_SeqAIJ         *yaij = (Mat_SeqAIJ*)Y->data;
  Mat_SeqAIJCUSPARSE *xcusp,*ycusp;
  CsrMatrix          *xcsr,*ycsr;
  const PetscScalar  *xval;
  PetscScalar        *yval;

  PetscFunctionBegin;
  ycusp = (Mat_SeqAIJCUSPARSE*)Y->spptr;
  xcusp = (Mat_SeqAIJCUSPARSE*)X->spptr;
  if (X->ops->axpy != Y->ops->axpy) {
    /* the two matrices do not share this implementation: do the update on the host */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  /* if we are here, it means both matrices are bound to GPU */
  ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
  if (ycusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  if (xcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
  ycsr = (CsrMatrix*)ycusp->mat->mat;
  xcsr = (CsrMatrix*)xcusp->mat->mat;

  /* see if we can turn this into a cublas axpy: equal nonzero counts, no compressed rows, and
     identical row offsets and column indices (columns are compared only if the rows match) */
  if (str != SAME_NONZERO_PATTERN && xaij->nz == yaij->nz && !xaij->compressedrow.use && !yaij->compressedrow.use) {
    bool samePattern = thrust::equal(thrust::device,ycsr->row_offsets->begin(),ycsr->row_offsets->end(),xcsr->row_offsets->begin())
                    && thrust::equal(thrust::device,ycsr->column_indices->begin(),ycsr->column_indices->end(),xcsr->column_indices->begin());
    if (samePattern) str = SAME_NONZERO_PATTERN;
  }
  /* spgeam is buggy with one column */
  if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;

  if (str == SAME_NONZERO_PATTERN) {
    cublasHandle_t blas;
    cublasStatus_t bstat;
    PetscBLASInt   inc = 1, len = 1;

    ierr  = MatSeqAIJCUSPARSEGetArrayRead(X,&xval);CHKERRQ(ierr);
    ierr  = MatSeqAIJCUSPARSEGetArray(Y,&yval);CHKERRQ(ierr);
    ierr  = PetscCUBLASGetHandle(&blas);CHKERRQ(ierr);
    ierr  = PetscBLASIntCast(xaij->nz,&len);CHKERRQ(ierr);
    ierr  = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    bstat = cublasXaxpy(blas,len,&a,xval,inc,yval,inc);CHKERRCUBLAS(bstat);
    cerr  = WaitForCUDA();CHKERRCUDA(cerr);
    ierr  = PetscLogGpuFlops(2.0*len);CHKERRQ(ierr);
    ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    ierr  = MatSeqAIJCUSPARSERestoreArrayRead(X,&xval);CHKERRQ(ierr);
    ierr  = MatSeqAIJCUSPARSERestoreArray(Y,&yval);CHKERRQ(ierr);
    ierr  = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else if (str == SUBSET_NONZERO_PATTERN) {
    cusparseStatus_t stat;
    PetscScalar      unit = 1.0; /* coefficient applied to Y in a*X + unit*Y */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    size_t           bufferSize;
    void             *buffer;
#endif

    ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&xval);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEGetArray(Y,&yval);CHKERRQ(ierr);
    /* the scalars a and unit live on the host, so switch the handle to host pointer mode for the call */
    stat = cusparseSetPointerMode(ycusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* CUDA >= 11: spgeam needs an explicit work buffer, sized here and released after the timed section */
    stat = cusparse_csr_spgeam_bufferSize(ycusp->handle,Y->rmap->n,Y->cmap->n,
                                          &a,xcusp->mat->descr,xaij->nz,xval,xcsr->row_offsets->data().get(),xcsr->column_indices->data().get(),
                                          &unit,ycusp->mat->descr,yaij->nz,yval,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),
                                          ycusp->mat->descr,yval,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
    cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(ycusp->handle,Y->rmap->n,Y->cmap->n,
                               &a,xcusp->mat->descr,xaij->nz,xval,xcsr->row_offsets->data().get(),xcsr->column_indices->data().get(),
                               &unit,ycusp->mat->descr,yaij->nz,yval,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),
                               ycusp->mat->descr,yval,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
#else
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    stat = cusparse_csr_spgeam(ycusp->handle,Y->rmap->n,Y->cmap->n,
                               &a,xcusp->mat->descr,xaij->nz,xval,xcsr->row_offsets->data().get(),xcsr->column_indices->data().get(),
                               &unit,ycusp->mat->descr,yaij->nz,yval,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get(),
                               ycusp->mat->descr,yval,ycsr->row_offsets->data().get(),ycsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuFlops(xaij->nz + yaij->nz);CHKERRQ(ierr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cerr = cudaFree(buffer);CHKERRCUDA(cerr);
#endif
    /* put the handle back into device pointer mode */
    stat = cusparseSetPointerMode(ycusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
    ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&xval);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSERestoreArray(Y,&yval);CHKERRQ(ierr);
    ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  } else {
    /* remaining pattern relations are handled by the host routine */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
    ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
318095639643SRichard Tran Mills 
/*
  MatScale_SeqAIJCUSPARSE - Scales the stored values of Y by a with a single cuBLAS
  scal call over the y->nz entries of the value array obtained from
  MatSeqAIJCUSPARSEGetArray(), then invalidates Y's cached diagonal.
*/
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
{
  PetscErrorCode ierr;
  cudaError_t    cerr;
  cublasStatus_t bstat;
  cublasHandle_t blas;
  Mat_SeqAIJ     *yaij = (Mat_SeqAIJ*)Y->data;
  PetscScalar    *yval;
  PetscBLASInt   inc = 1, len = 1;

  PetscFunctionBegin;
  ierr  = MatSeqAIJCUSPARSEGetArray(Y,&yval);CHKERRQ(ierr);
  ierr  = PetscCUBLASGetHandle(&blas);CHKERRQ(ierr);
  /* the nonzero count must fit in a BLAS integer */
  ierr  = PetscBLASIntCast(yaij->nz,&len);CHKERRQ(ierr);

  ierr  = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  bstat = cublasXscal(blas,len,&a,yval,inc);CHKERRCUBLAS(bstat);
  cerr  = WaitForCUDA();CHKERRCUDA(cerr);
  ierr  = PetscLogGpuFlops(len);CHKERRQ(ierr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  ierr  = MatSeqAIJCUSPARSERestoreArray(Y,&yval);CHKERRQ(ierr);
  ierr  = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
320433c9ba73SStefano Zampini 
/*
  MatZeroEntries_SeqAIJCUSPARSE - Zeroes the matrix values on the host and, for an
  unfactored matrix, in whichever GPU value arrays exist (the matrix and its stored
  transpose). The offload mask becomes PETSC_OFFLOAD_BOTH when the GPU copy of the
  matrix itself was zeroed, and PETSC_OFFLOAD_CPU otherwise.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij      = (Mat_SeqAIJ*)A->data;
  PetscBool      gpuZeroed = PETSC_FALSE; /* set only by the non-transposed GPU matrix */

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;

    if (cusp->mat) {
      CsrMatrix *csr = (CsrMatrix*)cusp->mat->mat;
      if (csr->values) {
        gpuZeroed = PETSC_TRUE;
        thrust::fill(thrust::device,csr->values->begin(),csr->values->end(),0.);
      }
    }
    if (cusp->matTranspose) {
      CsrMatrix *csrT = (CsrMatrix*)cusp->matTranspose->mat;
      if (csrT->values) {
        thrust::fill(thrust::device,csrT->values->begin(),csrT->values->end(),0.);
      }
    }
  }
  /* host values are zeroed directly rather than through MatZeroEntries_SeqAIJ(); a->i[nrows] is the number of stored entries */
  ierr = PetscArrayzero(aij->a,aij->i[A->rmap->n]);CHKERRQ(ierr);
  ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
  A->offloadmask = gpuZeroed ? PETSC_OFFLOAD_BOTH : PETSC_OFFLOAD_CPU;

  PetscFunctionReturn(0);
}
32363fa6b06aSMark Adams 
/*
  MatBindToCPU_SeqAIJCUSPARSE - Switches an unfactored matrix between its CUSPARSE and
  host SeqAIJ method tables.

  flg == PETSC_TRUE : the values are first copied back from the GPU, the operation table
                      is pointed at the SeqAIJ routines, and the CUSPARSE-specific composed
                      methods are removed.
  flg == PETSC_FALSE: the CUSPARSE routines and composed methods are installed.

  In both cases A->boundtocpu and the inode flag of the SeqAIJ data are set to flg.
  Factored matrices are left untouched.
*/
static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);

  if (!flg) {
    /* GPU side: install the CUSPARSE kernels ... */
    A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
    A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
    A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
    A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
    A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
    A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
    /* ... and the composed methods that go with them */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
  } else {
    /* CPU side: bring the values back from the GPU before the host kernels take over */
    ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);

    A->ops->scale                     = MatScale_SeqAIJ;
    A->ops->axpy                      = MatAXPY_SeqAIJ;
    A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
    A->ops->mult                      = MatMult_SeqAIJ;
    A->ops->multadd                   = MatMultAdd_SeqAIJ;
    A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
    A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
    /* no host routines are installed for the Hermitian-transpose products */
    A->ops->multhermitiantranspose    = NULL;
    A->ops->multhermitiantransposeadd = NULL;
    A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
    /* remove the CUSPARSE-specific composed methods; MatSeqAIJGetArray_C goes back to the host version */
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
    ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
  }

  A->boundtocpu  = flg;
  aij->inode.use = flg; /* inodes are enabled exactly when the matrix is bound to the CPU */
  PetscFunctionReturn(0);
}
3287a587d139SMark 
328849735bf3SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
32899ae82921SPaul Mullowney {
32909ae82921SPaul Mullowney   PetscErrorCode   ierr;
3291aa372e3fSPaul Mullowney   cusparseStatus_t stat;
329249735bf3SStefano Zampini   Mat              B;
32939ae82921SPaul Mullowney 
32949ae82921SPaul Mullowney   PetscFunctionBegin;
3295832b2c02SStefano Zampini   ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* first use of CUSPARSE may be via MatConvert */
329649735bf3SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
329749735bf3SStefano Zampini     ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
329849735bf3SStefano Zampini   } else if (reuse == MAT_REUSE_MATRIX) {
329949735bf3SStefano Zampini     ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
330049735bf3SStefano Zampini   }
330149735bf3SStefano Zampini   B = *newmat;
330249735bf3SStefano Zampini 
330334136279SStefano Zampini   ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
330434136279SStefano Zampini   ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);
330534136279SStefano Zampini 
330649735bf3SStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
33079ae82921SPaul Mullowney     if (B->factortype == MAT_FACTOR_NONE) {
3308e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSE *spptr;
3309e6e9a74fSStefano Zampini       ierr = PetscNew(&spptr);CHKERRQ(ierr);
3310e6e9a74fSStefano Zampini       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3311a0e72f99SJunchao Zhang       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3312*1a2c6b5cSJunchao Zhang       spptr->format     = MAT_CUSPARSE_CSR;
3313d8132acaSStefano Zampini      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3314d8132acaSStefano Zampini       spptr->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
3315d8132acaSStefano Zampini       spptr->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
3316d8132acaSStefano Zampini       spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
3317d8132acaSStefano Zampini      #endif
3318*1a2c6b5cSJunchao Zhang       B->spptr = spptr;
33199ae82921SPaul Mullowney     } else {
3320e6e9a74fSStefano Zampini       Mat_SeqAIJCUSPARSETriFactors *spptr;
3321e6e9a74fSStefano Zampini 
3322e6e9a74fSStefano Zampini       ierr = PetscNew(&spptr);CHKERRQ(ierr);
3323e6e9a74fSStefano Zampini       stat = cusparseCreate(&spptr->handle);CHKERRCUSPARSE(stat);
3324a0e72f99SJunchao Zhang       stat = cusparseSetStream(spptr->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
3325e6e9a74fSStefano Zampini       B->spptr = spptr;
33269ae82921SPaul Mullowney     }
3327e6e9a74fSStefano Zampini     B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
332849735bf3SStefano Zampini   }
3329693b0035SStefano Zampini   B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;
33309ae82921SPaul Mullowney   B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
3331*1a2c6b5cSJunchao Zhang   B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
33329ae82921SPaul Mullowney   B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
333395639643SRichard Tran Mills   B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
3334693b0035SStefano Zampini   B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
33352205254eSKarl Rupp 
3336e6e9a74fSStefano Zampini   ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
33379ae82921SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3338bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
33399ae82921SPaul Mullowney   PetscFunctionReturn(0);
33409ae82921SPaul Mullowney }
33419ae82921SPaul Mullowney 
334202fe1965SBarry Smith PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
334302fe1965SBarry Smith {
334402fe1965SBarry Smith   PetscErrorCode ierr;
334502fe1965SBarry Smith 
334602fe1965SBarry Smith   PetscFunctionBegin;
334702fe1965SBarry Smith   ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
33480ce8acdeSStefano Zampini   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
334902fe1965SBarry Smith   PetscFunctionReturn(0);
335002fe1965SBarry Smith }
335102fe1965SBarry Smith 
33523ca39a21SBarry Smith /*MC
3353e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3354e057df02SPaul Mullowney 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later and are not supported since CUDA 11.0.
33572692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3358e057df02SPaul Mullowney 
3359e057df02SPaul Mullowney    Options Database Keys:
3360e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3361aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3362a2b725a8SWilliam Gropp -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3363e057df02SPaul Mullowney 
3364e057df02SPaul Mullowney   Level: beginner
3365e057df02SPaul Mullowney 
33668468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3367e057df02SPaul Mullowney M*/
33687f756511SDominic Meiser 
336942c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat,MatFactorType,Mat*);
337042c9c57cSBarry Smith 
33710f39cd5aSBarry Smith 
33723ca39a21SBarry Smith PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
337342c9c57cSBarry Smith {
337442c9c57cSBarry Smith   PetscErrorCode ierr;
337542c9c57cSBarry Smith 
337642c9c57cSBarry Smith   PetscFunctionBegin;
33773ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
33783ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_CHOLESKY,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
33793ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ILU,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
33803ca39a21SBarry Smith   ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,MAT_FACTOR_ICC,MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
338142c9c57cSBarry Smith   PetscFunctionReturn(0);
338242c9c57cSBarry Smith }
338329b38603SBarry Smith 
3384470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
33857f756511SDominic Meiser {
3386e6e9a74fSStefano Zampini   PetscErrorCode   ierr;
33877f756511SDominic Meiser   cusparseStatus_t stat;
33887f756511SDominic Meiser 
33897f756511SDominic Meiser   PetscFunctionBegin;
33907f756511SDominic Meiser   if (*cusparsestruct) {
3391e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->mat,(*cusparsestruct)->format);CHKERRQ(ierr);
3392e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*cusparsestruct)->matTranspose,(*cusparsestruct)->format);CHKERRQ(ierr);
33937f756511SDominic Meiser     delete (*cusparsestruct)->workVector;
339481902715SJunchao Zhang     delete (*cusparsestruct)->rowoffsets_gpu;
33957e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm;
33967e8381f9SStefano Zampini     delete (*cusparsestruct)->cooPerm_a;
3397a49f1ed0SStefano Zampini     delete (*cusparsestruct)->csr2csc_i;
33987e8381f9SStefano Zampini     if ((*cusparsestruct)->handle) {stat = cusparseDestroy((*cusparsestruct)->handle);CHKERRCUSPARSE(stat);}
3399e6e9a74fSStefano Zampini     ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
34007f756511SDominic Meiser   }
34017f756511SDominic Meiser   PetscFunctionReturn(0);
34027f756511SDominic Meiser }
34037f756511SDominic Meiser 
34047f756511SDominic Meiser static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
34057f756511SDominic Meiser {
34067f756511SDominic Meiser   PetscFunctionBegin;
34077f756511SDominic Meiser   if (*mat) {
34087f756511SDominic Meiser     delete (*mat)->values;
34097f756511SDominic Meiser     delete (*mat)->column_indices;
34107f756511SDominic Meiser     delete (*mat)->row_offsets;
34117f756511SDominic Meiser     delete *mat;
34127f756511SDominic Meiser     *mat = 0;
34137f756511SDominic Meiser   }
34147f756511SDominic Meiser   PetscFunctionReturn(0);
34157f756511SDominic Meiser }
34167f756511SDominic Meiser 
3417470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
34187f756511SDominic Meiser {
34197f756511SDominic Meiser   cusparseStatus_t stat;
34207f756511SDominic Meiser   PetscErrorCode   ierr;
34217f756511SDominic Meiser 
34227f756511SDominic Meiser   PetscFunctionBegin;
34237f756511SDominic Meiser   if (*trifactor) {
342457d48284SJunchao Zhang     if ((*trifactor)->descr) { stat = cusparseDestroyMatDescr((*trifactor)->descr);CHKERRCUSPARSE(stat); }
3425afb2bd1cSJunchao Zhang     if ((*trifactor)->solveInfo) { stat = cusparse_destroy_analysis_info((*trifactor)->solveInfo);CHKERRCUSPARSE(stat); }
34267f756511SDominic Meiser     ierr = CsrMatrix_Destroy(&(*trifactor)->csrMat);CHKERRQ(ierr);
34271b0a6780SStefano Zampini     if ((*trifactor)->solveBuffer)   {cudaError_t cerr = cudaFree((*trifactor)->solveBuffer);CHKERRCUDA(cerr);}
34282cbc15d9SMark     if ((*trifactor)->AA_h)   {cudaError_t cerr = cudaFreeHost((*trifactor)->AA_h);CHKERRCUDA(cerr);}
3429afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
34301b0a6780SStefano Zampini     if ((*trifactor)->csr2cscBuffer) {cudaError_t cerr = cudaFree((*trifactor)->csr2cscBuffer);CHKERRCUDA(cerr);}
3431afb2bd1cSJunchao Zhang    #endif
3432da79fbbcSStefano Zampini     ierr = PetscFree(*trifactor);CHKERRQ(ierr);
34337f756511SDominic Meiser   }
34347f756511SDominic Meiser   PetscFunctionReturn(0);
34357f756511SDominic Meiser }
34367f756511SDominic Meiser 
3437470880abSPatrick Sanan static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
34387f756511SDominic Meiser {
34397f756511SDominic Meiser   CsrMatrix        *mat;
34407f756511SDominic Meiser   cusparseStatus_t stat;
34417f756511SDominic Meiser   cudaError_t      err;
34427f756511SDominic Meiser 
34437f756511SDominic Meiser   PetscFunctionBegin;
34447f756511SDominic Meiser   if (*matstruct) {
34457f756511SDominic Meiser     if ((*matstruct)->mat) {
34467f756511SDominic Meiser       if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
3447afb2bd1cSJunchao Zhang        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3448afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
3449afb2bd1cSJunchao Zhang        #else
34507f756511SDominic Meiser         cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
345157d48284SJunchao Zhang         stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
3452afb2bd1cSJunchao Zhang        #endif
34537f756511SDominic Meiser       } else {
34547f756511SDominic Meiser         mat = (CsrMatrix*)(*matstruct)->mat;
34557f756511SDominic Meiser         CsrMatrix_Destroy(&mat);
34567f756511SDominic Meiser       }
34577f756511SDominic Meiser     }
345857d48284SJunchao Zhang     if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
34597f756511SDominic Meiser     delete (*matstruct)->cprowIndices;
3460afb2bd1cSJunchao Zhang     if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
34617656d835SStefano Zampini     if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
34627656d835SStefano Zampini     if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }
3463afb2bd1cSJunchao Zhang 
3464afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3465afb2bd1cSJunchao Zhang     Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
3466afb2bd1cSJunchao Zhang     if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
3467afb2bd1cSJunchao Zhang     for (int i=0; i<3; i++) {
3468afb2bd1cSJunchao Zhang       if (mdata->cuSpMV[i].initialized) {
3469afb2bd1cSJunchao Zhang         err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
3470afb2bd1cSJunchao Zhang         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
3471afb2bd1cSJunchao Zhang         stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
3472afb2bd1cSJunchao Zhang       }
3473afb2bd1cSJunchao Zhang     }
3474afb2bd1cSJunchao Zhang    #endif
34757f756511SDominic Meiser     delete *matstruct;
34767e8381f9SStefano Zampini     *matstruct = NULL;
34777f756511SDominic Meiser   }
34787f756511SDominic Meiser   PetscFunctionReturn(0);
34797f756511SDominic Meiser }
34807f756511SDominic Meiser 
3481ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
34827f756511SDominic Meiser {
3483e6e9a74fSStefano Zampini   PetscErrorCode ierr;
3484e6e9a74fSStefano Zampini 
34857f756511SDominic Meiser   PetscFunctionBegin;
34867f756511SDominic Meiser   if (*trifactors) {
3487e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
3488e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
3489e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
3490e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
34917f756511SDominic Meiser     delete (*trifactors)->rpermIndices;
34927f756511SDominic Meiser     delete (*trifactors)->cpermIndices;
34937f756511SDominic Meiser     delete (*trifactors)->workVector;
34947e8381f9SStefano Zampini     (*trifactors)->rpermIndices = NULL;
34957e8381f9SStefano Zampini     (*trifactors)->cpermIndices = NULL;
34967e8381f9SStefano Zampini     (*trifactors)->workVector = NULL;
3497ccdfe979SStefano Zampini   }
3498ccdfe979SStefano Zampini   PetscFunctionReturn(0);
3499ccdfe979SStefano Zampini }
3500ccdfe979SStefano Zampini 
3501ccdfe979SStefano Zampini static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
3502ccdfe979SStefano Zampini {
3503e6e9a74fSStefano Zampini   PetscErrorCode   ierr;
3504ccdfe979SStefano Zampini   cusparseHandle_t handle;
3505ccdfe979SStefano Zampini   cusparseStatus_t stat;
3506ccdfe979SStefano Zampini 
3507ccdfe979SStefano Zampini   PetscFunctionBegin;
3508ccdfe979SStefano Zampini   if (*trifactors) {
3509e6e9a74fSStefano Zampini     ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
35107f756511SDominic Meiser     if (handle = (*trifactors)->handle) {
351157d48284SJunchao Zhang       stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
35127f756511SDominic Meiser     }
3513e6e9a74fSStefano Zampini     ierr = PetscFree(*trifactors);CHKERRQ(ierr);
35147f756511SDominic Meiser   }
35157f756511SDominic Meiser   PetscFunctionReturn(0);
35167f756511SDominic Meiser }
35177e8381f9SStefano Zampini 
35187e8381f9SStefano Zampini struct IJCompare
35197e8381f9SStefano Zampini {
35207e8381f9SStefano Zampini   __host__ __device__
35217e8381f9SStefano Zampini   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
35227e8381f9SStefano Zampini   {
35237e8381f9SStefano Zampini     if (t1.get<0>() < t2.get<0>()) return true;
35247e8381f9SStefano Zampini     if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
35257e8381f9SStefano Zampini     return false;
35267e8381f9SStefano Zampini   }
35277e8381f9SStefano Zampini };
35287e8381f9SStefano Zampini 
35297e8381f9SStefano Zampini struct IJEqual
35307e8381f9SStefano Zampini {
35317e8381f9SStefano Zampini   __host__ __device__
35327e8381f9SStefano Zampini   inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
35337e8381f9SStefano Zampini   {
35347e8381f9SStefano Zampini     if (t1.get<0>() != t2.get<0>() || t1.get<1>() != t2.get<1>()) return false;
35357e8381f9SStefano Zampini     return true;
35367e8381f9SStefano Zampini   }
35377e8381f9SStefano Zampini };
35387e8381f9SStefano Zampini 
35397e8381f9SStefano Zampini struct IJDiff
35407e8381f9SStefano Zampini {
35417e8381f9SStefano Zampini   __host__ __device__
35427e8381f9SStefano Zampini   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
35437e8381f9SStefano Zampini   {
35447e8381f9SStefano Zampini     return t1 == t2 ? 0 : 1;
35457e8381f9SStefano Zampini   }
35467e8381f9SStefano Zampini };
35477e8381f9SStefano Zampini 
35487e8381f9SStefano Zampini struct IJSum
35497e8381f9SStefano Zampini {
35507e8381f9SStefano Zampini   __host__ __device__
35517e8381f9SStefano Zampini   inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
35527e8381f9SStefano Zampini   {
35537e8381f9SStefano Zampini     return t1||t2;
35547e8381f9SStefano Zampini   }
35557e8381f9SStefano Zampini };
35567e8381f9SStefano Zampini 
35577e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
3558e61fc153SStefano Zampini PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
35597e8381f9SStefano Zampini {
35607e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE                    *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3561fcdce8c4SStefano Zampini   Mat_SeqAIJ                            *a = (Mat_SeqAIJ*)A->data;
3562bfcc3627SStefano Zampini   THRUSTARRAY                           *cooPerm_v = NULL;
356308391a17SStefano Zampini   thrust::device_ptr<const PetscScalar> d_v;
35647e8381f9SStefano Zampini   CsrMatrix                             *matrix;
35657e8381f9SStefano Zampini   PetscErrorCode                        ierr;
35667e8381f9SStefano Zampini   cudaError_t                           cerr;
35677e8381f9SStefano Zampini   PetscInt                              n;
35687e8381f9SStefano Zampini 
35697e8381f9SStefano Zampini   PetscFunctionBegin;
35707e8381f9SStefano Zampini   if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
35717e8381f9SStefano Zampini   if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
35727e8381f9SStefano Zampini   if (!cusp->cooPerm) {
35737e8381f9SStefano Zampini     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
35747e8381f9SStefano Zampini     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
35757e8381f9SStefano Zampini     PetscFunctionReturn(0);
35767e8381f9SStefano Zampini   }
35777e8381f9SStefano Zampini   matrix = (CsrMatrix*)cusp->mat->mat;
35787e8381f9SStefano Zampini   if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
3579e61fc153SStefano Zampini   if (!v) {
3580e61fc153SStefano Zampini     if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
3581e61fc153SStefano Zampini     goto finalize;
35827e8381f9SStefano Zampini   }
3583e61fc153SStefano Zampini   n = cusp->cooPerm->size();
358408391a17SStefano Zampini   if (isCudaMem(v)) {
358508391a17SStefano Zampini     d_v = thrust::device_pointer_cast(v);
358608391a17SStefano Zampini   } else {
3587e61fc153SStefano Zampini     cooPerm_v = new THRUSTARRAY(n);
3588e61fc153SStefano Zampini     cooPerm_v->assign(v,v+n);
358908391a17SStefano Zampini     d_v = cooPerm_v->data();
3590e61fc153SStefano Zampini     ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
359108391a17SStefano Zampini   }
3592bfcc3627SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3593e61fc153SStefano Zampini   if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
35947e8381f9SStefano Zampini     if (cusp->cooPerm_a) {
3595bfcc3627SStefano Zampini       THRUSTARRAY *cooPerm_w = new THRUSTARRAY(matrix->values->size());
359608391a17SStefano Zampini       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3597e61fc153SStefano Zampini       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
3598e61fc153SStefano Zampini       thrust::transform(cooPerm_w->begin(),cooPerm_w->end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
3599e61fc153SStefano Zampini       delete cooPerm_w;
36007e8381f9SStefano Zampini     } else {
360108391a17SStefano Zampini       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
36027e8381f9SStefano Zampini                                                                 matrix->values->begin()));
360308391a17SStefano Zampini       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
36047e8381f9SStefano Zampini                                                                 matrix->values->end()));
36057e8381f9SStefano Zampini       thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
36067e8381f9SStefano Zampini     }
36077e8381f9SStefano Zampini   } else {
3608e61fc153SStefano Zampini     if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
360908391a17SStefano Zampini       auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
3610e61fc153SStefano Zampini       thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
36117e8381f9SStefano Zampini     } else {
361208391a17SStefano Zampini       auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
36137e8381f9SStefano Zampini                                                                 matrix->values->begin()));
361408391a17SStefano Zampini       auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
36157e8381f9SStefano Zampini                                                                 matrix->values->end()));
36167e8381f9SStefano Zampini       thrust::for_each(zibit,zieit,VecCUDAEquals());
36177e8381f9SStefano Zampini     }
36187e8381f9SStefano Zampini   }
36197e8381f9SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
3620bfcc3627SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3621e61fc153SStefano Zampini finalize:
3622e61fc153SStefano Zampini   delete cooPerm_v;
36237e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_GPU;
3624e61fc153SStefano Zampini   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
3625fcdce8c4SStefano Zampini   /* shorter version of MatAssemblyEnd_SeqAIJ */
3626fcdce8c4SStefano Zampini   ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
3627fcdce8c4SStefano Zampini   ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
3628fcdce8c4SStefano Zampini   ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
3629fcdce8c4SStefano Zampini   a->reallocs         = 0;
3630fcdce8c4SStefano Zampini   A->info.mallocs    += 0;
3631fcdce8c4SStefano Zampini   A->info.nz_unneeded = 0;
3632fcdce8c4SStefano Zampini   A->assembled = A->was_assembled = PETSC_TRUE;
3633fcdce8c4SStefano Zampini   A->num_ass++;
36347e8381f9SStefano Zampini   PetscFunctionReturn(0);
36357e8381f9SStefano Zampini }
36367e8381f9SStefano Zampini 
3637a49f1ed0SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
3638a49f1ed0SStefano Zampini {
3639a49f1ed0SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
3640a49f1ed0SStefano Zampini   PetscErrorCode     ierr;
3641a49f1ed0SStefano Zampini 
3642a49f1ed0SStefano Zampini   PetscFunctionBegin;
3643a49f1ed0SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3644a49f1ed0SStefano Zampini   if (!cusp) PetscFunctionReturn(0);
3645a49f1ed0SStefano Zampini   if (destroy) {
3646a49f1ed0SStefano Zampini     ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);
3647a49f1ed0SStefano Zampini     delete cusp->csr2csc_i;
3648a49f1ed0SStefano Zampini     cusp->csr2csc_i = NULL;
3649a49f1ed0SStefano Zampini   }
3650*1a2c6b5cSJunchao Zhang   A->transupdated = PETSC_FALSE;
3651a49f1ed0SStefano Zampini   PetscFunctionReturn(0);
3652a49f1ed0SStefano Zampini }
3653a49f1ed0SStefano Zampini 
36547e8381f9SStefano Zampini #include <thrust/binary_search.h>
3655e61fc153SStefano Zampini PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
36567e8381f9SStefano Zampini {
36577e8381f9SStefano Zampini   PetscErrorCode     ierr;
36587e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
36597e8381f9SStefano Zampini   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
36607e8381f9SStefano Zampini   PetscInt           cooPerm_n, nzr = 0;
36617e8381f9SStefano Zampini   cudaError_t        cerr;
36627e8381f9SStefano Zampini 
36637e8381f9SStefano Zampini   PetscFunctionBegin;
36647e8381f9SStefano Zampini   ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
36657e8381f9SStefano Zampini   ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
36667e8381f9SStefano Zampini   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
36677e8381f9SStefano Zampini   if (n != cooPerm_n) {
36687e8381f9SStefano Zampini     delete cusp->cooPerm;
36697e8381f9SStefano Zampini     delete cusp->cooPerm_a;
36707e8381f9SStefano Zampini     cusp->cooPerm = NULL;
36717e8381f9SStefano Zampini     cusp->cooPerm_a = NULL;
36727e8381f9SStefano Zampini   }
36737e8381f9SStefano Zampini   if (n) {
36747e8381f9SStefano Zampini     THRUSTINTARRAY d_i(n);
36757e8381f9SStefano Zampini     THRUSTINTARRAY d_j(n);
36767e8381f9SStefano Zampini     THRUSTINTARRAY ii(A->rmap->n);
36777e8381f9SStefano Zampini 
36787e8381f9SStefano Zampini     if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
36797e8381f9SStefano Zampini     if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }
36807e8381f9SStefano Zampini 
36817e8381f9SStefano Zampini     ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
36827e8381f9SStefano Zampini     d_i.assign(coo_i,coo_i+n);
36837e8381f9SStefano Zampini     d_j.assign(coo_j,coo_j+n);
36847e8381f9SStefano Zampini     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
36857e8381f9SStefano Zampini     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));
36867e8381f9SStefano Zampini 
368708391a17SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
36887e8381f9SStefano Zampini     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
36897e8381f9SStefano Zampini     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
36907e8381f9SStefano Zampini     *cusp->cooPerm_a = d_i;
36917e8381f9SStefano Zampini     THRUSTINTARRAY w = d_j;
36927e8381f9SStefano Zampini 
36937e8381f9SStefano Zampini     auto nekey = thrust::unique(fkey, ekey, IJEqual());
36947e8381f9SStefano Zampini     if (nekey == ekey) { /* all entries are unique */
36957e8381f9SStefano Zampini       delete cusp->cooPerm_a;
36967e8381f9SStefano Zampini       cusp->cooPerm_a = NULL;
36977e8381f9SStefano Zampini     } else { /* I couldn't come up with a more elegant algorithm */
36987e8381f9SStefano Zampini       adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
36997e8381f9SStefano Zampini       adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
37007e8381f9SStefano Zampini       (*cusp->cooPerm_a)[0] = 0;
37017e8381f9SStefano Zampini       w[0] = 0;
37027e8381f9SStefano Zampini       thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
37037e8381f9SStefano Zampini       thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
37047e8381f9SStefano Zampini     }
37057e8381f9SStefano Zampini     thrust::counting_iterator<PetscInt> search_begin(0);
37067e8381f9SStefano Zampini     thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
37077e8381f9SStefano Zampini                         search_begin, search_begin + A->rmap->n,
37087e8381f9SStefano Zampini                         ii.begin());
370908391a17SStefano Zampini     cerr = WaitForCUDA();CHKERRCUDA(cerr);
371008391a17SStefano Zampini     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
37117e8381f9SStefano Zampini 
37127e8381f9SStefano Zampini     ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
37137e8381f9SStefano Zampini     a->singlemalloc = PETSC_FALSE;
37147e8381f9SStefano Zampini     a->free_a       = PETSC_TRUE;
37157e8381f9SStefano Zampini     a->free_ij      = PETSC_TRUE;
37167e8381f9SStefano Zampini     ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
37177e8381f9SStefano Zampini     a->i[0] = 0;
37187e8381f9SStefano Zampini     cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
37197e8381f9SStefano Zampini     a->nz = a->maxnz = a->i[A->rmap->n];
3720fcdce8c4SStefano Zampini     a->rmax = 0;
37217e8381f9SStefano Zampini     ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
37227e8381f9SStefano Zampini     ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
37237e8381f9SStefano Zampini     cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
37247e8381f9SStefano Zampini     if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
37257e8381f9SStefano Zampini     if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
37267e8381f9SStefano Zampini     for (PetscInt i = 0; i < A->rmap->n; i++) {
37277e8381f9SStefano Zampini       const PetscInt nnzr = a->i[i+1] - a->i[i];
37287e8381f9SStefano Zampini       nzr += (PetscInt)!!(nnzr);
37297e8381f9SStefano Zampini       a->ilen[i] = a->imax[i] = nnzr;
3730fcdce8c4SStefano Zampini       a->rmax = PetscMax(a->rmax,nnzr);
37317e8381f9SStefano Zampini     }
3732fcdce8c4SStefano Zampini     a->nonzerorowcnt = nzr;
37337e8381f9SStefano Zampini     A->preallocated = PETSC_TRUE;
37347e8381f9SStefano Zampini     ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
3735fcdce8c4SStefano Zampini     ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
37367e8381f9SStefano Zampini   } else {
37377e8381f9SStefano Zampini     ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
37387e8381f9SStefano Zampini   }
3739e61fc153SStefano Zampini   ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);
37407e8381f9SStefano Zampini 
37417e8381f9SStefano Zampini   /* We want to allocate the CUSPARSE struct for matvec now.
3742e61fc153SStefano Zampini      The code is so convoluted now that I prefer to copy zeros */
3743e61fc153SStefano Zampini   ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
37447e8381f9SStefano Zampini   ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
37457e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
37467e8381f9SStefano Zampini   A->nonzerostate++;
37477e8381f9SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3748a49f1ed0SStefano Zampini   ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
37497e8381f9SStefano Zampini 
37507e8381f9SStefano Zampini   A->assembled = PETSC_FALSE;
37517e8381f9SStefano Zampini   A->was_assembled = PETSC_FALSE;
37527e8381f9SStefano Zampini   PetscFunctionReturn(0);
37537e8381f9SStefano Zampini }
3754ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSEGetArrayRead - Gives read-only access to the values array of the
  CsrMatrix stored in the CUSPARSE mult struct of A.

  Input Parameter:
. A - a MATSEQAIJCUSPARSE matrix; the ELL and HYB storage formats are rejected

  Output Parameter:
. a - raw pointer obtained from csr->values->data().get()

  Notes:
  MatSeqAIJCUSPARSECopyToGPU() is called before the pointer is extracted.
  Fails with PETSC_ERR_SUP for ELL/HYB formats and with PETSC_ERR_COR when the
  mult struct, the CsrMatrix or its values array is missing.
  Pair with MatSeqAIJCUSPARSERestoreArrayRead().
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* A is dereferenced only after it has been validated above */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  /* guard the CsrMatrix itself before looking at its members */
  if (!csr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CsrMatrix");
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}
3773ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArrayRead - Ends the read-only access started with
  MatSeqAIJCUSPARSEGetArrayRead().

  Input Parameters:
+ A - a MATSEQAIJCUSPARSE matrix
- a - address of the pointer handed out by the Get call; set to NULL on return

  Notes:
  Only the arguments are validated and the caller's pointer is cleared; the
  matrix itself is not modified here.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);

  /* drop the caller's handle so it cannot be reused after the restore */
  *a = NULL;

  PetscFunctionReturn(0);
}
3783ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSEGetArray - Gives read-write access to the values array of the
  CsrMatrix stored in the CUSPARSE mult struct of A.

  Input Parameter:
. A - a MATSEQAIJCUSPARSE matrix; the ELL and HYB storage formats are rejected

  Output Parameter:
. a - raw pointer obtained from csr->values->data().get()

  Notes:
  MatSeqAIJCUSPARSECopyToGPU() is called before the pointer is extracted.
  On success A->offloadmask is set to PETSC_OFFLOAD_GPU and
  MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE) is called, since the caller
  may modify the values.
  Fails with PETSC_ERR_SUP for ELL/HYB formats and with PETSC_ERR_COR when the
  mult struct, the CsrMatrix or its values array is missing.
  Pair with MatSeqAIJCUSPARSERestoreArray().
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* A is dereferenced only after it has been validated above */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  /* guard the CsrMatrix itself before looking at its members */
  if (!csr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CsrMatrix");
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller may write through the pointer: the GPU copy becomes the valid one */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3804039c6fbaSStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArray - Ends the read-write access started with
  MatSeqAIJCUSPARSEGetArray().

  Input Parameters:
+ A - a MATSEQAIJCUSPARSE matrix
- a - address of the pointer handed out by the Get call; set to NULL on return

  Notes:
  The object state of A is increased because the values may have been modified
  through the pointer.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);

  /* record that the matrix may have changed */
  ierr = PetscObjectStateIncrease((PetscObject)A);
  CHKERRQ(ierr);

  /* drop the caller's handle so it cannot be reused after the restore */
  *a = NULL;

  PetscFunctionReturn(0);
}
3817039c6fbaSStefano Zampini 
/*
  MatSeqAIJCUSPARSEGetArrayWrite - Gives write access to the values array of the
  CsrMatrix stored in the CUSPARSE mult struct of A.

  Input Parameter:
. A - a MATSEQAIJCUSPARSE matrix; the ELL and HYB storage formats are rejected

  Output Parameter:
. a - raw pointer obtained from csr->values->data().get()

  Notes:
  Unlike MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSECopyToGPU() is NOT called
  here, so the mult struct must already exist and the array contents are whatever
  was there before -- presumably the caller is expected to overwrite them.
  On success A->offloadmask is set to PETSC_OFFLOAD_GPU and
  MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE) is called.
  Fails with PETSC_ERR_SUP for ELL/HYB formats and with PETSC_ERR_COR when the
  mult struct, the CsrMatrix or its values array is missing.
  Pair with MatSeqAIJCUSPARSERestoreArrayWrite().
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* A is dereferenced only after it has been validated above */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  /* guard the CsrMatrix itself before looking at its members */
  if (!csr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CsrMatrix");
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* the caller writes through the pointer: the GPU copy becomes the valid one */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3837ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArrayWrite - Ends the write access started with
  MatSeqAIJCUSPARSEGetArrayWrite().

  Input Parameters:
+ A - a MATSEQAIJCUSPARSE matrix
- a - address of the pointer handed out by the Get call; set to NULL on return

  Notes:
  The object state of A is increased because the values may have been modified
  through the pointer.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);

  /* record that the matrix may have changed */
  ierr = PetscObjectStateIncrease((PetscObject)A);
  CHKERRQ(ierr);

  /* drop the caller's handle so it cannot be reused after the restore */
  *a = NULL;

  PetscFunctionReturn(0);
}
3850ed502f03SStefano Zampini 
/*
  IJCompare4 - "less than" comparator for 4-tuples (int, int, PetscScalar, int).

  Tuples are ordered by element 0 first, ties are broken by element 1.
  Elements 2 and 3 never take part in the comparison.
  It is passed to thrust::merge() in MatSeqAIJCUSPARSEMergeMats(), where the
  first two tuple entries come from the COO row array and the column indices.

  The call operator is const since it does not touch any state, so the functor
  can also be invoked through a const object or reference.
*/
struct IJCompare4
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) const
  {
    if (t1.get<0>() < t2.get<0>()) return true;
    /* same leading key: decide on the second one */
    if (t1.get<0>() == t2.get<0>()) return t1.get<1>() < t2.get<1>();
    return false;
  }
};
3861ed502f03SStefano Zampini 
/*
  Shift - unary functor returning its integer argument plus a fixed offset.

  The offset is stored at construction time. In MatSeqAIJCUSPARSEMergeMats() it is
  used with thrust::make_transform_iterator()/thrust::transform() to offset
  column indices and row offsets.

  The constructor is callable from host and device, and the call operator is
  const because it does not modify the functor.
*/
struct Shift
{
  int _shift; /* offset added to every argument */

  __host__ __device__
  Shift(int shift) : _shift(shift) {}
  __host__ __device__
  inline int operator() (const int &c) const
  {
    return c + _shift;
  }
};
3873ed502f03SStefano Zampini 
/* merges two SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */
3875ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3876ed502f03SStefano Zampini {
3877ed502f03SStefano Zampini   PetscErrorCode               ierr;
3878ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3879ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3880ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3881ed502f03SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3882ed502f03SStefano Zampini   PetscInt                     Annz,Bnnz;
3883ed502f03SStefano Zampini   cusparseStatus_t             stat;
3884ed502f03SStefano Zampini   PetscInt                     i,m,n,zero = 0;
3885ed502f03SStefano Zampini   cudaError_t                  cerr;
3886ed502f03SStefano Zampini 
3887ed502f03SStefano Zampini   PetscFunctionBegin;
3888ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3889ed502f03SStefano Zampini   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
3890ed502f03SStefano Zampini   PetscValidPointer(C,4);
3891ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3892ed502f03SStefano Zampini   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
3893ed502f03SStefano Zampini   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
3894ed502f03SStefano Zampini   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3895ed502f03SStefano Zampini   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3896ed502f03SStefano Zampini   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3897ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
3898ed502f03SStefano Zampini     m     = A->rmap->n;
3899ed502f03SStefano Zampini     n     = A->cmap->n + B->cmap->n;
3900ed502f03SStefano Zampini     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
3901ed502f03SStefano Zampini     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
3902ed502f03SStefano Zampini     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3903ed502f03SStefano Zampini     c     = (Mat_SeqAIJ*)(*C)->data;
3904ed502f03SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3905ed502f03SStefano Zampini     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3906ed502f03SStefano Zampini     Ccsr  = new CsrMatrix;
3907ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
3908ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
3909ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
3910ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
3911ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
3912ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
3913ed502f03SStefano Zampini     Ccusp->nrows    = m;
3914ed502f03SStefano Zampini     Ccusp->mat      = Cmat;
3915ed502f03SStefano Zampini     Ccusp->mat->mat = Ccsr;
3916ed502f03SStefano Zampini     Ccsr->num_rows  = m;
3917ed502f03SStefano Zampini     Ccsr->num_cols  = n;
3918ed502f03SStefano Zampini     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
3919ed502f03SStefano Zampini     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3920ed502f03SStefano Zampini     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
3921ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
3922ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
3923ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
3924ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3925ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3926ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3927ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3928ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
3929*1a2c6b5cSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
3930*1a2c6b5cSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
3931ed502f03SStefano Zampini     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3932ed502f03SStefano Zampini     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3933ed502f03SStefano Zampini 
3934ed502f03SStefano Zampini     Acsr = (CsrMatrix*)Acusp->mat->mat;
3935ed502f03SStefano Zampini     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
3936ed502f03SStefano Zampini     Annz = (PetscInt)Acsr->column_indices->size();
3937ed502f03SStefano Zampini     Bnnz = (PetscInt)Bcsr->column_indices->size();
3938ed502f03SStefano Zampini     c->nz = Annz + Bnnz;
3939ed502f03SStefano Zampini     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
3940ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3941ed502f03SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
3942ed502f03SStefano Zampini     Ccsr->num_entries = c->nz;
3943ed502f03SStefano Zampini     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
3944ed502f03SStefano Zampini     if (c->nz) {
39452ed87e7eSStefano Zampini       auto Acoo = new THRUSTINTARRAY32(Annz);
39462ed87e7eSStefano Zampini       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
39472ed87e7eSStefano Zampini       auto Ccoo = new THRUSTINTARRAY32(c->nz);
39482ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff,*Broff;
39492ed87e7eSStefano Zampini 
3950ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
3951ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
3952ed502f03SStefano Zampini           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
3953ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
3954ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3955ed502f03SStefano Zampini         }
39562ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
39572ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
3958ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
3959ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
3960ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
3961ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
3962ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3963ed502f03SStefano Zampini         }
39642ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
39652ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
3966ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
39672ed87e7eSStefano Zampini       stat = cusparseXcsr2coo(Acusp->handle,
39682ed87e7eSStefano Zampini                               Aroff->data().get(),
39692ed87e7eSStefano Zampini                               Annz,
39702ed87e7eSStefano Zampini                               m,
39712ed87e7eSStefano Zampini                               Acoo->data().get(),
39722ed87e7eSStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3973ed502f03SStefano Zampini       stat = cusparseXcsr2coo(Bcusp->handle,
39742ed87e7eSStefano Zampini                               Broff->data().get(),
3975ed502f03SStefano Zampini                               Bnnz,
3976ed502f03SStefano Zampini                               m,
39772ed87e7eSStefano Zampini                               Bcoo->data().get(),
3978ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
39792ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
39802ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
39812ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
39828909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
3983ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
3984ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
39858909a122SStefano Zampini #else
39868909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
39878909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
39888909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
39898909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
39908909a122SStefano Zampini #endif
39912ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
39922ed87e7eSStefano Zampini       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
39932ed87e7eSStefano Zampini       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
39942ed87e7eSStefano Zampini       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
39952ed87e7eSStefano Zampini       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
39962ed87e7eSStefano Zampini       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
3997ed502f03SStefano Zampini       auto p1 = Ccusp->cooPerm->begin();
3998ed502f03SStefano Zampini       auto p2 = Ccusp->cooPerm->begin();
3999ed502f03SStefano Zampini       thrust::advance(p2,Annz);
40002ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
40018909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
40028909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
40038909a122SStefano Zampini #endif
40042ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
40052ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
40062ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
40072ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
40082ed87e7eSStefano Zampini #else
40092ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
40102ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
40112ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
40122ed87e7eSStefano Zampini #endif
4013ed502f03SStefano Zampini       stat = cusparseXcoo2csr(Ccusp->handle,
40142ed87e7eSStefano Zampini                               Ccoo->data().get(),
4015ed502f03SStefano Zampini                               c->nz,
4016ed502f03SStefano Zampini                               m,
4017ed502f03SStefano Zampini                               Ccsr->row_offsets->data().get(),
4018ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4019ed502f03SStefano Zampini       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4020ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
40212ed87e7eSStefano Zampini       delete wPerm;
40222ed87e7eSStefano Zampini       delete Acoo;
40232ed87e7eSStefano Zampini       delete Bcoo;
40242ed87e7eSStefano Zampini       delete Ccoo;
4025ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4026ed502f03SStefano Zampini       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4027ed502f03SStefano Zampini                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4028ed502f03SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4029ed502f03SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4030ed502f03SStefano Zampini #endif
4031*1a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4032ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4033ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4034ed502f03SStefano Zampini         CsrMatrix *CcsrT = new CsrMatrix;
4035ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4036ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4037ed502f03SStefano Zampini 
4038*1a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
4039*1a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4040a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu = NULL;
4041ed502f03SStefano Zampini         CmatT->cprowIndices = NULL;
4042ed502f03SStefano Zampini         CmatT->mat = CcsrT;
4043ed502f03SStefano Zampini         CcsrT->num_rows = n;
4044ed502f03SStefano Zampini         CcsrT->num_cols = m;
4045ed502f03SStefano Zampini         CcsrT->num_entries = c->nz;
4046ed502f03SStefano Zampini 
4047ed502f03SStefano Zampini         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4048ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4049ed502f03SStefano Zampini         CcsrT->values = new THRUSTARRAY(c->nz);
4050ed502f03SStefano Zampini 
4051ed502f03SStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4052ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4053ed502f03SStefano Zampini         if (AT) {
4054ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4055ed502f03SStefano Zampini           thrust::advance(rT,-1);
4056ed502f03SStefano Zampini         }
4057ed502f03SStefano Zampini         if (BT) {
4058ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4059ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4060ed502f03SStefano Zampini           thrust::copy(titb,tite,rT);
4061ed502f03SStefano Zampini         }
4062ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4063ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4064ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4065ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4066ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4067ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4068ed502f03SStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
4069ed502f03SStefano Zampini         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4070ed502f03SStefano Zampini 
4071ed502f03SStefano Zampini         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4072ed502f03SStefano Zampini         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4073ed502f03SStefano Zampini         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4074ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4075ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4076ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4077ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4078ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4079ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4080ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4081ed502f03SStefano Zampini         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4082ed502f03SStefano Zampini                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4083ed502f03SStefano Zampini                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4084ed502f03SStefano Zampini                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4085ed502f03SStefano Zampini #endif
4086ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4087ed502f03SStefano Zampini       }
4088ed502f03SStefano Zampini     }
4089ed502f03SStefano Zampini 
4090ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4091ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4092ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
4093ed502f03SStefano Zampini     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4094ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4095ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4096ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4097ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4098ed502f03SStefano Zampini       ii   = *Ccsr->row_offsets;
4099ed502f03SStefano Zampini       jj   = *Ccsr->column_indices;
4100ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4101ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4102ed502f03SStefano Zampini     } else {
4103ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4104ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4105ed502f03SStefano Zampini     }
4106ed502f03SStefano Zampini     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4107ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4108ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4109ed502f03SStefano Zampini     c->maxnz = c->nz;
4110ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4111ed502f03SStefano Zampini     c->rmax = 0;
4112ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4113ed502f03SStefano Zampini       const PetscInt nn = c->i[i+1] - c->i[i];
4114ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4115ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
4116ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax,nn);
4117ed502f03SStefano Zampini     }
4118ed502f03SStefano Zampini     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4119ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4120ed502f03SStefano Zampini     (*C)->nonzerostate++;
4121ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4122ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4123ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4124ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4125ed502f03SStefano Zampini   } else {
4126ed502f03SStefano Zampini     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
4127ed502f03SStefano Zampini     c = (Mat_SeqAIJ*)(*C)->data;
4128ed502f03SStefano Zampini     if (c->nz) {
4129ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4130ed502f03SStefano Zampini       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4131ed502f03SStefano Zampini       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4132ed502f03SStefano Zampini       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4133ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4134ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4135ed502f03SStefano Zampini       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4136ed502f03SStefano Zampini       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4137ed502f03SStefano Zampini       Acsr = (CsrMatrix*)Acusp->mat->mat;
4138ed502f03SStefano Zampini       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4139ed502f03SStefano Zampini       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4140ed502f03SStefano Zampini       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4141ed502f03SStefano Zampini       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4142ed502f03SStefano Zampini       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4143ed502f03SStefano Zampini       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4144ed502f03SStefano Zampini       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4145ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
4146ed502f03SStefano Zampini       thrust::advance(pmid,Acsr->num_entries);
4147ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4148ed502f03SStefano Zampini       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4149ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4150ed502f03SStefano Zampini       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4151ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4152ed502f03SStefano Zampini       thrust::for_each(zibait,zieait,VecCUDAEquals());
4153ed502f03SStefano Zampini       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4154ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4155ed502f03SStefano Zampini       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4156ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4157ed502f03SStefano Zampini       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4158a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
4159*1a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4160ed502f03SStefano Zampini         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4161ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4162ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4163ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4164ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4165ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4166ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4167ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4168*1a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4169ed502f03SStefano Zampini       }
4170ed502f03SStefano Zampini       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4171ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4172ed502f03SStefano Zampini     }
4173ed502f03SStefano Zampini   }
4174ed502f03SStefano Zampini   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4175ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4176ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4177ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4178ed502f03SStefano Zampini   PetscFunctionReturn(0);
4179ed502f03SStefano Zampini }
4180c215019aSStefano Zampini 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies n entries of the value array returned by
  MatSeqAIJCUSPARSEGetArrayRead() into v.

  Input Parameters:
+ A   - the matrix whose value array is read
. n   - number of entries to copy
- idx - positions inside the value array to gather; when idx is NULL (or n is 0) the
        first n entries are copied contiguously instead

  Output Parameter:
. v   - destination array; isCudaMem(v) decides whether it is written as device memory
        (directly) or as host memory (through a device-to-host cudaMemcpy)

  Notes:
  The pointer obtained from MatSeqAIJCUSPARSEGetArrayRead() is used as a device pointer
  throughout (device_pointer_cast, cudaMemcpyDeviceTo*).

  When v is host memory and idx is given, the gather is performed into a temporary device
  buffer that is then copied back to v. The buffer is a scope-bound THRUSTARRAY so that it
  is released on every exit path, including the early returns taken by CHKERRQ/CHKERRCUDA.
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* upload the gather indices (idx is dereferenced on the host by assign()) */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    /* device staging buffer; stays empty when v can be written on the device directly */
    THRUSTARRAY                     w;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w.resize(n);
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* pair av[idx[i]] with dv[i]; VecCUDAEquals (defined elsewhere) is expected to assign
       the first tuple element to the second -- NOTE(review): confirm against its definition */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (!dmem) {
      /* bring the gathered values back to the host destination */
      cerr = cudaMemcpy(v,w.data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
  } else {
    /* no index set: plain contiguous copy of the leading n values */
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* a host destination means the values travelled device -> host */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4220