xref: /petsc/src/mat/impls/aij/seq/seqcusparse/aijcusparse.cu (revision 4ac6704c710c1f28695377ac78b2ce44e2406750)
19ae82921SPaul Mullowney /*
29ae82921SPaul Mullowney   Defines the basic matrix operations for the AIJ (compressed row)
3fd7c363cSSatish Balay   matrix storage format using the CUSPARSE library,
49ae82921SPaul Mullowney */
5dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
653800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
799acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
89ae82921SPaul Mullowney 
93d13b8fdSMatthew G. Knepley #include <petscconf.h>
103d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
11087f3262SPaul Mullowney #include <../src/mat/impls/sbaij/seq/sbaij.h>
123d13b8fdSMatthew G. Knepley #include <../src/vec/vec/impls/dvecimpl.h>
13af0996ceSBarry Smith #include <petsc/private/vecimpl.h>
149ae82921SPaul Mullowney #undef VecType
153d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
16a0e72f99SJunchao Zhang #include <thrust/async/for_each.h>
17bddcd29dSMark Adams #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
18bddcd29dSMark Adams #include <cooperative_groups.h>
19bddcd29dSMark Adams #endif
/* Name table for MatCUSPARSEStorageFormat: the value names in enum order, followed by the enum type name,
   the common value prefix and a terminating 0. It is passed to PetscOptionsEnum() in
   MatSetFromOptions_SeqAIJCUSPARSE() below. */
const char *const MatCUSPARSEStorageFormats[]    = {"CSR","ELL","HYB","MatCUSPARSEStorageFormat","MAT_CUSPARSE_",0};
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  /* The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in
    0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them.

  typedef enum {
      CUSPARSE_MV_ALG_DEFAULT = 0,
      CUSPARSE_COOMV_ALG      = 1,
      CUSPARSE_CSRMV_ALG1     = 2,
      CUSPARSE_CSRMV_ALG2     = 3
  } cusparseSpMVAlg_t;

  typedef enum {
      CUSPARSE_MM_ALG_DEFAULT     CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
      CUSPARSE_COOMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1)    = 1,
      CUSPARSE_COOMM_ALG2         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2)    = 2,
      CUSPARSE_COOMM_ALG3         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3)    = 3,
      CUSPARSE_CSRMM_ALG1         CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1)    = 4,
      CUSPARSE_SPMM_ALG_DEFAULT = 0,
      CUSPARSE_SPMM_COO_ALG1    = 1,
      CUSPARSE_SPMM_COO_ALG2    = 2,
      CUSPARSE_SPMM_COO_ALG3    = 3,
      CUSPARSE_SPMM_COO_ALG4    = 5,
      CUSPARSE_SPMM_CSR_ALG1    = 4,
      CUSPARSE_SPMM_CSR_ALG2    = 6,
  } cusparseSpMMAlg_t;

  typedef enum {
      CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministic
      CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministic
  } cusparseCsr2CscAlg_t;
  */
  /* Each table lists the value names by increasing integer value (note "CSR_ALG1" = 4 comes before "COO_ALG4" = 5),
     then the enum type name, the common prefix and a terminating 0. */
  const char *const MatCUSPARSESpMVAlgorithms[]    = {"MV_ALG_DEFAULT","COOMV_ALG", "CSRMV_ALG1","CSRMV_ALG2", "cusparseSpMVAlg_t","CUSPARSE_",0};
  const char *const MatCUSPARSESpMMAlgorithms[]    = {"ALG_DEFAULT","COO_ALG1","COO_ALG2","COO_ALG3","CSR_ALG1","COO_ALG4","CSR_ALG2","cusparseSpMMAlg_t","CUSPARSE_SPMM_",0};
  const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID"/*cusparse does not have enum 0! We created one*/,"ALG1","ALG2","cusparseCsr2CscAlg_t","CUSPARSE_CSR2CSC_",0};
#endif
569ae82921SPaul Mullowney 
/* Forward declarations of the file-local routines defined later in this file */

/* symbolic/numeric factorization entry points */
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,const MatFactorInfo*);
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat,Mat,const MatFactorInfo*);
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat,Mat,IS,IS,const MatFactorInfo*);
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat,Mat,const MatFactorInfo*);

/* solves, option handling and matrix-vector products */
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat,Vec,Vec);
static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat);
static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat,PetscScalar,Mat,MatStructure);
static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat,PetscScalar);
static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat,Vec,Vec);
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec);
static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat,Vec,Vec,Vec,PetscBool,PetscBool);

/* destruction/reset helpers; MatSeqAIJCUSPARSEMultStruct_Destroy is overloaded on the struct type it receives */
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct**);
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct**,MatCUSPARSEStorageFormat);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors**);
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE**);

/* host/device copies and transpose invalidation */
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat);
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat,PetscBool);

/* COO assembly routines, declared PETSC_INTERN (not file-local) */
PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],PetscScalar[]);

97c215019aSStefano Zampini 
98b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetStream(Mat A,const cudaStream_t stream)
99b06137fdSPaul Mullowney {
100b06137fdSPaul Mullowney   cusparseStatus_t   stat;
101b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
102b06137fdSPaul Mullowney 
103b06137fdSPaul Mullowney   PetscFunctionBegin;
104d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
105b06137fdSPaul Mullowney   cusparsestruct->stream = stream;
10657d48284SJunchao Zhang   stat = cusparseSetStream(cusparsestruct->handle,cusparsestruct->stream);CHKERRCUSPARSE(stat);
107b06137fdSPaul Mullowney   PetscFunctionReturn(0);
108b06137fdSPaul Mullowney }
109b06137fdSPaul Mullowney 
110b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSESetHandle(Mat A,const cusparseHandle_t handle)
111b06137fdSPaul Mullowney {
112b06137fdSPaul Mullowney   cusparseStatus_t   stat;
113b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
114b06137fdSPaul Mullowney 
115b06137fdSPaul Mullowney   PetscFunctionBegin;
116d98d7c49SStefano Zampini   if (!cusparsestruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
1176b1cf21dSAlejandro Lamas Daviña   if (cusparsestruct->handle != handle) {
11816a2e217SAlejandro Lamas Daviña     if (cusparsestruct->handle) {
11957d48284SJunchao Zhang       stat = cusparseDestroy(cusparsestruct->handle);CHKERRCUSPARSE(stat);
12016a2e217SAlejandro Lamas Daviña     }
121b06137fdSPaul Mullowney     cusparsestruct->handle = handle;
1226b1cf21dSAlejandro Lamas Daviña   }
12357d48284SJunchao Zhang   stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
124b06137fdSPaul Mullowney   PetscFunctionReturn(0);
125b06137fdSPaul Mullowney }
126b06137fdSPaul Mullowney 
127b06137fdSPaul Mullowney PetscErrorCode MatCUSPARSEClearHandle(Mat A)
128b06137fdSPaul Mullowney {
129b06137fdSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1307e8381f9SStefano Zampini   PetscBool          flg;
1317e8381f9SStefano Zampini   PetscErrorCode     ierr;
132ccdfe979SStefano Zampini 
133b06137fdSPaul Mullowney   PetscFunctionBegin;
1347e8381f9SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
1357e8381f9SStefano Zampini   if (!flg || !cusparsestruct) PetscFunctionReturn(0);
136ccdfe979SStefano Zampini   if (cusparsestruct->handle) cusparsestruct->handle = 0;
137b06137fdSPaul Mullowney   PetscFunctionReturn(0);
138b06137fdSPaul Mullowney }
139b06137fdSPaul Mullowney 
140ea799195SBarry Smith PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat A,MatSolverType *type)
1419ae82921SPaul Mullowney {
1429ae82921SPaul Mullowney   PetscFunctionBegin;
1439ae82921SPaul Mullowney   *type = MATSOLVERCUSPARSE;
1449ae82921SPaul Mullowney   PetscFunctionReturn(0);
1459ae82921SPaul Mullowney }
1469ae82921SPaul Mullowney 
147c708e6cdSJed Brown /*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for sequential matrices
  on a single GPU of type seqaijcusparse, aijcusparse, seqaijcusp, or aijcusp. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU and Cholesky decompositions can be solved through the
  CUSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.
154c708e6cdSJed Brown 
1559ae82921SPaul Mullowney   Level: beginner
156c708e6cdSJed Brown 
1573ca39a21SBarry Smith .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
158c708e6cdSJed Brown M*/
1599ae82921SPaul Mullowney 
16042c9c57cSBarry Smith PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A,MatFactorType ftype,Mat *B)
1619ae82921SPaul Mullowney {
1629ae82921SPaul Mullowney   PetscErrorCode ierr;
163bc3f50f2SPaul Mullowney   PetscInt       n = A->rmap->n;
1649ae82921SPaul Mullowney 
1659ae82921SPaul Mullowney   PetscFunctionBegin;
166bc3f50f2SPaul Mullowney   ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
167bc3f50f2SPaul Mullowney   ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
1682c7c0729SBarry Smith   (*B)->factortype = ftype;
1699ae82921SPaul Mullowney   ierr = MatSetType(*B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1702205254eSKarl Rupp 
171087f3262SPaul Mullowney   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
17233d57670SJed Brown     ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
1739ae82921SPaul Mullowney     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
1749ae82921SPaul Mullowney     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
175*4ac6704cSBarry Smith     ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
176*4ac6704cSBarry Smith     ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILU]);CHKERRQ(ierr);
177*4ac6704cSBarry Smith     ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ILUDT]);CHKERRQ(ierr);
178087f3262SPaul Mullowney   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
179087f3262SPaul Mullowney     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
180087f3262SPaul Mullowney     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
181*4ac6704cSBarry Smith     ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]);CHKERRQ(ierr);
182*4ac6704cSBarry Smith     ierr = PetscStrallocpy(MATORDERINGNATURAL,(char**)&(*B)->preferredordering[MAT_FACTOR_ICC]);CHKERRQ(ierr);
1839ae82921SPaul Mullowney   } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSE Matrix Types");
184bc3f50f2SPaul Mullowney 
185fa03d054SJed Brown   ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
186*4ac6704cSBarry Smith   (*B)->canuseordering = PETSC_TRUE;
1873ca39a21SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse);CHKERRQ(ierr);
1889ae82921SPaul Mullowney   PetscFunctionReturn(0);
1899ae82921SPaul Mullowney }
1909ae82921SPaul Mullowney 
191bc3f50f2SPaul Mullowney PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
192ca45077fSPaul Mullowney {
193aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
1946e111a19SKarl Rupp 
195ca45077fSPaul Mullowney   PetscFunctionBegin;
196ca45077fSPaul Mullowney   switch (op) {
197e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT:
198aa372e3fSPaul Mullowney     cusparsestruct->format = format;
199ca45077fSPaul Mullowney     break;
200e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
201aa372e3fSPaul Mullowney     cusparsestruct->format = format;
202ca45077fSPaul Mullowney     break;
203ca45077fSPaul Mullowney   default:
20436d62e41SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.",op);
205ca45077fSPaul Mullowney   }
206ca45077fSPaul Mullowney   PetscFunctionReturn(0);
207ca45077fSPaul Mullowney }
2089ae82921SPaul Mullowney 
209e057df02SPaul Mullowney /*@
210e057df02SPaul Mullowney    MatCUSPARSESetFormat - Sets the storage format of CUSPARSE matrices for a particular
211e057df02SPaul Mullowney    operation. Only the MatMult operation can use different GPU storage formats
212aa372e3fSPaul Mullowney    for MPIAIJCUSPARSE matrices.
213e057df02SPaul Mullowney    Not Collective
214e057df02SPaul Mullowney 
215e057df02SPaul Mullowney    Input Parameters:
2168468deeeSKarl Rupp +  A - Matrix of type SEQAIJCUSPARSE
21736d62e41SPaul Mullowney .  op - MatCUSPARSEFormatOperation. SEQAIJCUSPARSE matrices support MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL. MPIAIJCUSPARSE matrices support MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL.
2182692e278SPaul Mullowney -  format - MatCUSPARSEStorageFormat (one of MAT_CUSPARSE_CSR, MAT_CUSPARSE_ELL, MAT_CUSPARSE_HYB. The latter two require CUDA 4.2)
219e057df02SPaul Mullowney 
220e057df02SPaul Mullowney    Output Parameter:
221e057df02SPaul Mullowney 
222e057df02SPaul Mullowney    Level: intermediate
223e057df02SPaul Mullowney 
2248468deeeSKarl Rupp .seealso: MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
225e057df02SPaul Mullowney @*/
226e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
227e057df02SPaul Mullowney {
228e057df02SPaul Mullowney   PetscErrorCode ierr;
2296e111a19SKarl Rupp 
230e057df02SPaul Mullowney   PetscFunctionBegin;
231e057df02SPaul Mullowney   PetscValidHeaderSpecific(A, MAT_CLASSID,1);
232e057df02SPaul Mullowney   ierr = PetscTryMethod(A,"MatCUSPARSESetFormat_C",(Mat,MatCUSPARSEFormatOperation,MatCUSPARSEStorageFormat),(A,op,format));CHKERRQ(ierr);
233e057df02SPaul Mullowney   PetscFunctionReturn(0);
234e057df02SPaul Mullowney }
235e057df02SPaul Mullowney 
2361a2c6b5cSJunchao Zhang PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A,MatOption op,PetscBool flg)
237e6e9a74fSStefano Zampini {
238e6e9a74fSStefano Zampini   PetscErrorCode ierr;
239e6e9a74fSStefano Zampini 
240e6e9a74fSStefano Zampini   PetscFunctionBegin;
2411a2c6b5cSJunchao Zhang   switch (op) {
2421a2c6b5cSJunchao Zhang     case MAT_FORM_EXPLICIT_TRANSPOSE:
2431a2c6b5cSJunchao Zhang       /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
2441a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && !flg) {ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);}
2451a2c6b5cSJunchao Zhang       A->form_explicit_transpose = flg;
2461a2c6b5cSJunchao Zhang       break;
2471a2c6b5cSJunchao Zhang     default:
2481a2c6b5cSJunchao Zhang       ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr);
2491a2c6b5cSJunchao Zhang       break;
250e6e9a74fSStefano Zampini   }
251e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
252e6e9a74fSStefano Zampini }
253e6e9a74fSStefano Zampini 
254bddcd29dSMark Adams static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A);
255bddcd29dSMark Adams 
256bddcd29dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
257bddcd29dSMark Adams {
258bddcd29dSMark Adams   Mat_SeqAIJ     *b = (Mat_SeqAIJ*)B->data;
259bddcd29dSMark Adams   IS             isrow = b->row,iscol = b->col;
260bddcd29dSMark Adams   PetscBool      row_identity,col_identity;
261bddcd29dSMark Adams   PetscErrorCode ierr;
262bddcd29dSMark Adams 
263bddcd29dSMark Adams   PetscFunctionBegin;
264bddcd29dSMark Adams   ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
265bddcd29dSMark Adams   ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
266bddcd29dSMark Adams   B->offloadmask = PETSC_OFFLOAD_CPU;
267bddcd29dSMark Adams   /* determine which version of MatSolve needs to be used. */
268bddcd29dSMark Adams   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
269bddcd29dSMark Adams   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
270bddcd29dSMark Adams   if (row_identity && col_identity) {
271bddcd29dSMark Adams     B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
272bddcd29dSMark Adams     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
273bddcd29dSMark Adams     B->ops->matsolve = NULL;
274bddcd29dSMark Adams     B->ops->matsolvetranspose = NULL;
275bddcd29dSMark Adams   } else {
276bddcd29dSMark Adams     B->ops->solve = MatSolve_SeqAIJCUSPARSE;
277bddcd29dSMark Adams     B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
278bddcd29dSMark Adams     B->ops->matsolve = NULL;
279bddcd29dSMark Adams     B->ops->matsolvetranspose = NULL;
280bddcd29dSMark Adams   }
281bddcd29dSMark Adams 
282bddcd29dSMark Adams   /* get the triangular factors */
283bddcd29dSMark Adams   ierr = MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
284bddcd29dSMark Adams   PetscFunctionReturn(0);
285bddcd29dSMark Adams }
286bddcd29dSMark Adams 
2874416b707SBarry Smith static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
2889ae82921SPaul Mullowney {
2899ae82921SPaul Mullowney   PetscErrorCode           ierr;
290e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
2919ae82921SPaul Mullowney   PetscBool                flg;
292a183c035SDominic Meiser   Mat_SeqAIJCUSPARSE       *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
2936e111a19SKarl Rupp 
2949ae82921SPaul Mullowney   PetscFunctionBegin;
295e55864a3SBarry Smith   ierr = PetscOptionsHead(PetscOptionsObject,"SeqAIJCUSPARSE options");CHKERRQ(ierr);
2969ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
297e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV",
298a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
299afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT,format);CHKERRQ(ierr);}
300afb2bd1cSJunchao Zhang 
3014c87dfd4SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve",
302a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparsestruct->format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
303afb2bd1cSJunchao Zhang     if (flg) {ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);}
304afb2bd1cSJunchao Zhang    #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
305afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmv_alg","sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)",
306afb2bd1cSJunchao Zhang                             "cusparseSpMVAlg_t",MatCUSPARSESpMVAlgorithms,(PetscEnum)cusparsestruct->spmvAlg,(PetscEnum*)&cusparsestruct->spmvAlg,&flg);CHKERRQ(ierr);
307afb2bd1cSJunchao Zhang     /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */
308afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSRMV_ALG1 != 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly");
309afb2bd1cSJunchao Zhang 
310afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_spmm_alg","sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)",
311afb2bd1cSJunchao Zhang                             "cusparseSpMMAlg_t",MatCUSPARSESpMMAlgorithms,(PetscEnum)cusparsestruct->spmmAlg,(PetscEnum*)&cusparsestruct->spmmAlg,&flg);CHKERRQ(ierr);
312afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_SPMM_CSR_ALG1 != 4) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly");
313afb2bd1cSJunchao Zhang 
314afb2bd1cSJunchao Zhang     ierr = PetscOptionsEnum("-mat_cusparse_csr2csc_alg","sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices",
315afb2bd1cSJunchao Zhang                             "cusparseCsr2CscAlg_t",MatCUSPARSECsr2CscAlgorithms,(PetscEnum)cusparsestruct->csr2cscAlg,(PetscEnum*)&cusparsestruct->csr2cscAlg,&flg);CHKERRQ(ierr);
316afb2bd1cSJunchao Zhang     if (flg && CUSPARSE_CSR2CSC_ALG1 != 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly");
317afb2bd1cSJunchao Zhang    #endif
3184c87dfd4SPaul Mullowney   }
3190af67c1bSStefano Zampini   ierr = PetscOptionsTail();CHKERRQ(ierr);
3209ae82921SPaul Mullowney   PetscFunctionReturn(0);
3219ae82921SPaul Mullowney }
3229ae82921SPaul Mullowney 
3236fa9248bSJed Brown static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3249ae82921SPaul Mullowney {
325da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3269ae82921SPaul Mullowney   PetscErrorCode               ierr;
3279ae82921SPaul Mullowney 
3289ae82921SPaul Mullowney   PetscFunctionBegin;
329da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
3309ae82921SPaul Mullowney   ierr = MatILUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3319ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3329ae82921SPaul Mullowney   PetscFunctionReturn(0);
3339ae82921SPaul Mullowney }
3349ae82921SPaul Mullowney 
3356fa9248bSJed Brown static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3369ae82921SPaul Mullowney {
337da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
3389ae82921SPaul Mullowney   PetscErrorCode               ierr;
3399ae82921SPaul Mullowney 
3409ae82921SPaul Mullowney   PetscFunctionBegin;
341da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
3429ae82921SPaul Mullowney   ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr);
3439ae82921SPaul Mullowney   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
3449ae82921SPaul Mullowney   PetscFunctionReturn(0);
3459ae82921SPaul Mullowney }
3469ae82921SPaul Mullowney 
347087f3262SPaul Mullowney static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
348087f3262SPaul Mullowney {
349da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
350087f3262SPaul Mullowney   PetscErrorCode               ierr;
351087f3262SPaul Mullowney 
352087f3262SPaul Mullowney   PetscFunctionBegin;
353da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
354087f3262SPaul Mullowney   ierr = MatICCFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
355087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
356087f3262SPaul Mullowney   PetscFunctionReturn(0);
357087f3262SPaul Mullowney }
358087f3262SPaul Mullowney 
359087f3262SPaul Mullowney static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B,Mat A,IS perm,const MatFactorInfo *info)
360087f3262SPaul Mullowney {
361da79fbbcSStefano Zampini   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
362087f3262SPaul Mullowney   PetscErrorCode               ierr;
363087f3262SPaul Mullowney 
364087f3262SPaul Mullowney   PetscFunctionBegin;
365da79fbbcSStefano Zampini   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
366087f3262SPaul Mullowney   ierr = MatCholeskyFactorSymbolic_SeqAIJ(B,A,perm,info);CHKERRQ(ierr);
367087f3262SPaul Mullowney   B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
368087f3262SPaul Mullowney   PetscFunctionReturn(0);
369087f3262SPaul Mullowney }
370087f3262SPaul Mullowney 
371087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A)
3729ae82921SPaul Mullowney {
3739ae82921SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
3749ae82921SPaul Mullowney   PetscInt                          n = A->rmap->n;
3759ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
376aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
3779ae82921SPaul Mullowney   cusparseStatus_t                  stat;
3789ae82921SPaul Mullowney   const PetscInt                    *ai = a->i,*aj = a->j,*vi;
3799ae82921SPaul Mullowney   const MatScalar                   *aa = a->a,*v;
3809ae82921SPaul Mullowney   PetscInt                          *AiLo, *AjLo;
3819ae82921SPaul Mullowney   PetscInt                          i,nz, nzLower, offset, rowOffset;
382b175d8bbSPaul Mullowney   PetscErrorCode                    ierr;
38357d48284SJunchao Zhang   cudaError_t                       cerr;
3849ae82921SPaul Mullowney 
3859ae82921SPaul Mullowney   PetscFunctionBegin;
386cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
387c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
3889ae82921SPaul Mullowney     try {
3899ae82921SPaul Mullowney       /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */
3909ae82921SPaul Mullowney       nzLower=n+ai[n]-ai[1];
391da79fbbcSStefano Zampini       if (!loTriFactor) {
3922cbc15d9SMark         PetscScalar                       *AALo;
3932cbc15d9SMark 
3942cbc15d9SMark         cerr = cudaMallocHost((void**) &AALo, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
3959ae82921SPaul Mullowney 
3969ae82921SPaul Mullowney         /* Allocate Space for the lower triangular matrix */
39757d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiLo, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
39857d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjLo, nzLower*sizeof(PetscInt));CHKERRCUDA(cerr);
3999ae82921SPaul Mullowney 
4009ae82921SPaul Mullowney         /* Fill the lower triangular matrix */
4019ae82921SPaul Mullowney         AiLo[0]  = (PetscInt) 0;
4029ae82921SPaul Mullowney         AiLo[n]  = nzLower;
4039ae82921SPaul Mullowney         AjLo[0]  = (PetscInt) 0;
4049ae82921SPaul Mullowney         AALo[0]  = (MatScalar) 1.0;
4059ae82921SPaul Mullowney         v        = aa;
4069ae82921SPaul Mullowney         vi       = aj;
4079ae82921SPaul Mullowney         offset   = 1;
4089ae82921SPaul Mullowney         rowOffset= 1;
4099ae82921SPaul Mullowney         for (i=1; i<n; i++) {
4109ae82921SPaul Mullowney           nz = ai[i+1] - ai[i];
411e057df02SPaul Mullowney           /* additional 1 for the term on the diagonal */
4129ae82921SPaul Mullowney           AiLo[i]    = rowOffset;
4139ae82921SPaul Mullowney           rowOffset += nz+1;
4149ae82921SPaul Mullowney 
415580bdb30SBarry Smith           ierr = PetscArraycpy(&(AjLo[offset]), vi, nz);CHKERRQ(ierr);
416580bdb30SBarry Smith           ierr = PetscArraycpy(&(AALo[offset]), v, nz);CHKERRQ(ierr);
4179ae82921SPaul Mullowney 
4189ae82921SPaul Mullowney           offset      += nz;
4199ae82921SPaul Mullowney           AjLo[offset] = (PetscInt) i;
4209ae82921SPaul Mullowney           AALo[offset] = (MatScalar) 1.0;
4219ae82921SPaul Mullowney           offset      += 1;
4229ae82921SPaul Mullowney 
4239ae82921SPaul Mullowney           v  += nz;
4249ae82921SPaul Mullowney           vi += nz;
4259ae82921SPaul Mullowney         }
4262205254eSKarl Rupp 
427aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
428da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
429da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
430aa372e3fSPaul Mullowney         /* Create the matrix description */
43157d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
43257d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4331b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
434afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
435afb2bd1cSJunchao Zhang        #else
43657d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
437afb2bd1cSJunchao Zhang        #endif
43857d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER);CHKERRCUSPARSE(stat);
43957d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
440aa372e3fSPaul Mullowney 
441aa372e3fSPaul Mullowney         /* set the operation */
442aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
443aa372e3fSPaul Mullowney 
444aa372e3fSPaul Mullowney         /* set the matrix */
445aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
446aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = n;
447aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = n;
448aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = nzLower;
449aa372e3fSPaul Mullowney 
450aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n+1);
451aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo+n+1);
452aa372e3fSPaul Mullowney 
453aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower);
454aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjLo, AjLo+nzLower);
455aa372e3fSPaul Mullowney 
456aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(nzLower);
457aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+nzLower);
458aa372e3fSPaul Mullowney 
459afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
460da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
461afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
4621b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
463afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
464afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
465afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
466afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
467afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
468afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
469afb2bd1cSJunchao Zhang       #endif
470afb2bd1cSJunchao Zhang 
471aa372e3fSPaul Mullowney         /* perform the solve analysis */
472aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
473aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
474aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
475afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
4761b0a6780SStefano Zampini                                #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
477afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
478afb2bd1cSJunchao Zhang                                #endif
479afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
480da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
481da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
482aa372e3fSPaul Mullowney 
483da79fbbcSStefano Zampini         /* assign the pointer */
484aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
4852cbc15d9SMark         loTriFactor->AA_h = AALo;
48657d48284SJunchao Zhang         cerr = cudaFreeHost(AiLo);CHKERRCUDA(cerr);
48757d48284SJunchao Zhang         cerr = cudaFreeHost(AjLo);CHKERRCUDA(cerr);
4884863603aSSatish Balay         ierr = PetscLogCpuToGpu((n+1+nzLower)*sizeof(int)+nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
489da79fbbcSStefano Zampini       } else { /* update values only */
4902cbc15d9SMark         if (!loTriFactor->AA_h) {
4912cbc15d9SMark           cerr = cudaMallocHost((void**) &loTriFactor->AA_h, nzLower*sizeof(PetscScalar));CHKERRCUDA(cerr);
4922cbc15d9SMark         }
493da79fbbcSStefano Zampini         /* Fill the lower triangular matrix */
4942cbc15d9SMark         loTriFactor->AA_h[0]  = 1.0;
495da79fbbcSStefano Zampini         v        = aa;
496da79fbbcSStefano Zampini         vi       = aj;
497da79fbbcSStefano Zampini         offset   = 1;
498da79fbbcSStefano Zampini         for (i=1; i<n; i++) {
499da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i];
5002cbc15d9SMark           ierr = PetscArraycpy(&(loTriFactor->AA_h[offset]), v, nz);CHKERRQ(ierr);
501da79fbbcSStefano Zampini           offset      += nz;
5022cbc15d9SMark           loTriFactor->AA_h[offset] = 1.0;
503da79fbbcSStefano Zampini           offset      += 1;
504da79fbbcSStefano Zampini           v  += nz;
505da79fbbcSStefano Zampini         }
5062cbc15d9SMark         loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h+nzLower);
507da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(nzLower*sizeof(PetscScalar));CHKERRQ(ierr);
508da79fbbcSStefano Zampini       }
5099ae82921SPaul Mullowney     } catch(char *ex) {
5109ae82921SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
5119ae82921SPaul Mullowney     }
5129ae82921SPaul Mullowney   }
5139ae82921SPaul Mullowney   PetscFunctionReturn(0);
5149ae82921SPaul Mullowney }
5159ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildILUUpperTriMatrix - puts the upper triangular ILU factor of A on the GPU

  Returns immediately for an empty matrix, or when A->offloadmask is neither
  PETSC_OFFLOAD_UNALLOCATED nor PETSC_OFFLOAD_CPU.

  If no upper factor struct is attached to A->spptr yet, the CSR row offsets, column
  indices and values are assembled in page-locked host buffers, copied into thrust
  device arrays, the cuSPARSE matrix descriptor is configured (upper fill, non-unit
  diagonal) and the triangular solve analysis is run. The new struct is then stored in
  upTriFactorPtr; the host values buffer is kept in AA_h, the two index buffers are freed.

  If the struct already exists only the values are regenerated (into AA_h, allocated on
  demand) and copied over the device values array.

  Host layout as used by the indexing below (presumably the usual factored SeqAIJ
  convention - confirm against the factorization code): the entries of row i occupy
  positions adiag[i+1]+1 .. adiag[i] of a->a / a->j, the last of them being treated as
  the diagonal; its reciprocal is what gets stored for the GPU.
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A)
{
  Mat_SeqAIJ                        *aij      = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors      *factors  = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  const PetscInt                    n         = A->rmap->n;
  const PetscInt                    *colidx   = aij->j,*diagpos = aij->diag,*rowcols;
  const MatScalar                   *vals     = aij->a,*rowvals;
  PetscInt                          *rowptrHost,*colHost;
  PetscInt                          row,noff,nnzUp,pos;
  PetscErrorCode                    ierr;
  cudaError_t                       cerr;
  cusparseStatus_t                  stat;

  PetscFunctionBegin;
  if (!n) PetscFunctionReturn(0);
  /* nothing to do unless the offload state is UNALLOCATED or CPU */
  if (A->offloadmask != PETSC_OFFLOAD_UNALLOCATED && A->offloadmask != PETSC_OFFLOAD_CPU) PetscFunctionReturn(0);

  try {
    /* number of nonzeros in the upper triangular factor, diagonal included */
    nnzUp = diagpos[0] - diagpos[n];

    if (upFactor) {
      /* ---- factor already on the GPU: refresh the values only ---- */
      PetscScalar *hostVals;

      if (!upFactor->AA_h) {
        cerr = cudaMallocHost((void**) &upFactor->AA_h, nnzUp*sizeof(PetscScalar));CHKERRCUDA(cerr);
      }
      hostVals = upFactor->AA_h;

      /* rows are visited last to first while the write position walks down from nnzUp */
      pos = nnzUp;
      for (row=n; row-- > 0;) {
        rowvals = vals + diagpos[row+1] + 1;
        noff    = diagpos[row] - diagpos[row+1] - 1;   /* entries NOT on the diagonal */
        pos    -= noff + 1;

        /* diagonal goes first in the row, followed by the off-diagonal entries */
        hostVals[pos] = 1./rowvals[noff];
        ierr = PetscArraycpy(hostVals+pos+1, rowvals, noff);CHKERRQ(ierr);
      }
      upFactor->csrMat->values->assign(hostVals, hostVals+nnzUp);
      ierr = PetscLogCpuToGpu(nnzUp*sizeof(PetscScalar));CHKERRQ(ierr);
    } else {
      /* ---- first call: build structure and values, upload, analyze ---- */
      PetscScalar *valHost;
      CsrMatrix   *csr;

      cerr = cudaMallocHost((void**) &valHost, nnzUp*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &rowptrHost, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
      cerr = cudaMallocHost((void**) &colHost, nnzUp*sizeof(PetscInt));CHKERRCUDA(cerr);

      /* assemble the CSR arrays of the upper factor, last row first */
      rowptrHost[0] = (PetscInt) 0;
      rowptrHost[n] = nnzUp;
      pos           = nnzUp;
      for (row=n; row-- > 0;) {
        rowvals = vals   + diagpos[row+1] + 1;
        rowcols = colidx + diagpos[row+1] + 1;
        noff    = diagpos[row] - diagpos[row+1] - 1;   /* entries NOT on the diagonal */

        /* step back to the start of this row; pos always equals rowptrHost[row+1]-(noff+1) */
        pos -= noff + 1;
        rowptrHost[row] = pos;

        /* diagonal entry leads the row */
        colHost[pos] = (PetscInt) row;
        valHost[pos] = (MatScalar)1.0/rowvals[noff];

        /* off-diagonal entries follow */
        ierr = PetscArraycpy(colHost+pos+1, rowcols, noff);CHKERRQ(ierr);
        ierr = PetscArraycpy(valHost+pos+1, rowvals, noff);CHKERRQ(ierr);
      }

      /* bookkeeping struct for this triangular factor */
      ierr = PetscNew(&upFactor);CHKERRQ(ierr);
      upFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

      /* matrix descriptor: zero based, upper fill, non-unit diagonal */
      stat = cusparseCreateMatDescr(&upFactor->descr);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatIndexBase(upFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      stat = cusparseSetMatType(upFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
     #else
      stat = cusparseSetMatType(upFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
     #endif
      stat = cusparseSetMatFillMode(upFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
      stat = cusparseSetMatDiagType(upFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);

      /* the solve uses the matrix as is */
      upFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

      /* device CSR matrix, filled from the host buffers */
      csr = new CsrMatrix;
      upFactor->csrMat = csr;
      csr->num_rows    = n;
      csr->num_cols    = n;
      csr->num_entries = nnzUp;

      csr->row_offsets = new THRUSTINTARRAY32(n+1);
      csr->row_offsets->assign(rowptrHost, rowptrHost+n+1);

      csr->column_indices = new THRUSTINTARRAY32(nnzUp);
      csr->column_indices->assign(colHost, colHost+nnzUp);

      csr->values = new THRUSTARRAY(nnzUp);
      csr->values->assign(valHost, valHost+nnzUp);

      /* solve analysis (timed under MAT_CUSPARSESolveAnalysis) */
      ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
      stat = cusparse_create_analysis_info(&upFactor->solveInfo);CHKERRCUSPARSE(stat);
     #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
      /* query and allocate the work buffer, then analyze with policy and buffer */
      stat = cusparse_get_svbuffsize(factors->handle, upFactor->solveOp,
                                     csr->num_rows, csr->num_entries, upFactor->descr,
                                     csr->values->data().get(), csr->row_offsets->data().get(),
                                     csr->column_indices->data().get(), upFactor->solveInfo,
                                     &upFactor->solveBufferSize);CHKERRCUSPARSE(stat);
      cerr = cudaMalloc(&upFactor->solveBuffer,upFactor->solveBufferSize);CHKERRCUDA(cerr);
      stat = cusparse_analysis(factors->handle, upFactor->solveOp,
                               csr->num_rows, csr->num_entries, upFactor->descr,
                               csr->values->data().get(), csr->row_offsets->data().get(),
                               csr->column_indices->data().get(), upFactor->solveInfo,
                               upFactor->solvePolicy, upFactor->solveBuffer);CHKERRCUSPARSE(stat);
     #else
      stat = cusparse_analysis(factors->handle, upFactor->solveOp,
                               csr->num_rows, csr->num_entries, upFactor->descr,
                               csr->values->data().get(), csr->row_offsets->data().get(),
                               csr->column_indices->data().get(), upFactor->solveInfo);CHKERRCUSPARSE(stat);
     #endif
      cerr = WaitForCUDA();CHKERRCUDA(cerr);
      ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

      /* publish the factor; keep the host values buffer for later value-only updates */
      factors->upTriFactorPtr = upFactor;
      upFactor->AA_h          = valHost;
      cerr = cudaFreeHost(rowptrHost);CHKERRCUDA(cerr);
      cerr = cudaFreeHost(colHost);CHKERRCUDA(cerr);
      ierr = PetscLogCpuToGpu((n+1+nnzUp)*sizeof(int)+nnzUp*sizeof(PetscScalar));CHKERRQ(ierr);
    }
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
  }
  PetscFunctionReturn(0);
}
6599ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU - builds both ILU triangular factors on the GPU
  and uploads the row/column permutations when they are not the identity

  Errors with PETSC_ERR_COR if A->spptr holds no triangular factors container.
  Besides calling the lower and upper builders it creates the work vector (length n)
  on first use, records a->nz in the container, marks A as PETSC_OFFLOAD_BOTH and,
  for each of a->row and a->icol that is not an identity and has no device copy yet,
  copies its indices into rpermIndices / cpermIndices.
*/
static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A)
{
  Mat_SeqAIJ                   *aij     = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           rowperm  = aij->row,colperm = aij->icol;
  const PetscInt               m        = A->rmap->n;
  PetscBool                    isid;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!factors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");

  /* lower factor first, then upper factor */
  ierr = MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A);CHKERRQ(ierr);

  /* work vector is created once and reused afterwards */
  if (!factors->workVector) {
    factors->workVector = new THRUSTARRAY(m);
  }
  factors->nnz = aij->nz;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* row permutation: uploaded only if it is not the identity and not on the device yet */
  ierr = ISIdentity(rowperm,&isid);CHKERRQ(ierr);
  if (!isid) {
    if (!factors->rpermIndices) {
      const PetscInt *idx;

      ierr = ISGetIndices(rowperm,&idx);CHKERRQ(ierr);
      factors->rpermIndices = new THRUSTINTARRAY(m);
      factors->rpermIndices->assign(idx, idx+m);
      ierr = ISRestoreIndices(rowperm,&idx);CHKERRQ(ierr);
      ierr = PetscLogCpuToGpu(m*sizeof(PetscInt));CHKERRQ(ierr);
    }
  }

  /* column permutation: same treatment */
  ierr = ISIdentity(colperm,&isid);CHKERRQ(ierr);
  if (!isid) {
    if (!factors->cpermIndices) {
      const PetscInt *idx;

      ierr = ISGetIndices(colperm,&idx);CHKERRQ(ierr);
      factors->cpermIndices = new THRUSTINTARRAY(m);
      factors->cpermIndices->assign(idx, idx+m);
      ierr = ISRestoreIndices(colperm,&idx);CHKERRQ(ierr);
      ierr = PetscLogCpuToGpu(m*sizeof(PetscInt));CHKERRQ(ierr);
    }
  }
  PetscFunctionReturn(0);
}
7039ae82921SPaul Mullowney 
704087f3262SPaul Mullowney static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A)
705087f3262SPaul Mullowney {
706087f3262SPaul Mullowney   Mat_SeqAIJ                        *a = (Mat_SeqAIJ*)A->data;
707087f3262SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
708aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
709aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
710087f3262SPaul Mullowney   cusparseStatus_t                  stat;
711087f3262SPaul Mullowney   PetscErrorCode                    ierr;
71257d48284SJunchao Zhang   cudaError_t                       cerr;
713087f3262SPaul Mullowney   PetscInt                          *AiUp, *AjUp;
714087f3262SPaul Mullowney   PetscScalar                       *AAUp;
715087f3262SPaul Mullowney   PetscScalar                       *AALo;
716087f3262SPaul Mullowney   PetscInt                          nzUpper = a->nz,n = A->rmap->n,i,offset,nz,j;
717087f3262SPaul Mullowney   Mat_SeqSBAIJ                      *b = (Mat_SeqSBAIJ*)A->data;
718087f3262SPaul Mullowney   const PetscInt                    *ai = b->i,*aj = b->j,*vj;
719087f3262SPaul Mullowney   const MatScalar                   *aa = b->a,*v;
720087f3262SPaul Mullowney 
721087f3262SPaul Mullowney   PetscFunctionBegin;
722cf00fe3bSKarl Rupp   if (!n) PetscFunctionReturn(0);
723c70f7ee4SJunchao Zhang   if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
724087f3262SPaul Mullowney     try {
725da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AAUp, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
726da79fbbcSStefano Zampini       cerr = cudaMallocHost((void**) &AALo, nzUpper*sizeof(PetscScalar));CHKERRCUDA(cerr);
727da79fbbcSStefano Zampini       if (!upTriFactor && !loTriFactor) {
728087f3262SPaul Mullowney         /* Allocate Space for the upper triangular matrix */
72957d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AiUp, (n+1)*sizeof(PetscInt));CHKERRCUDA(cerr);
73057d48284SJunchao Zhang         cerr = cudaMallocHost((void**) &AjUp, nzUpper*sizeof(PetscInt));CHKERRCUDA(cerr);
731087f3262SPaul Mullowney 
732087f3262SPaul Mullowney         /* Fill the upper triangular matrix */
733087f3262SPaul Mullowney         AiUp[0]=(PetscInt) 0;
734087f3262SPaul Mullowney         AiUp[n]=nzUpper;
735087f3262SPaul Mullowney         offset = 0;
736087f3262SPaul Mullowney         for (i=0; i<n; i++) {
737087f3262SPaul Mullowney           /* set the pointers */
738087f3262SPaul Mullowney           v  = aa + ai[i];
739087f3262SPaul Mullowney           vj = aj + ai[i];
740087f3262SPaul Mullowney           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
741087f3262SPaul Mullowney 
742087f3262SPaul Mullowney           /* first, set the diagonal elements */
743087f3262SPaul Mullowney           AjUp[offset] = (PetscInt) i;
74409f51544SAlejandro Lamas Daviña           AAUp[offset] = (MatScalar)1.0/v[nz];
745087f3262SPaul Mullowney           AiUp[i]      = offset;
74609f51544SAlejandro Lamas Daviña           AALo[offset] = (MatScalar)1.0/v[nz];
747087f3262SPaul Mullowney 
748087f3262SPaul Mullowney           offset+=1;
749087f3262SPaul Mullowney           if (nz>0) {
750f22e0265SBarry Smith             ierr = PetscArraycpy(&(AjUp[offset]), vj, nz);CHKERRQ(ierr);
751580bdb30SBarry Smith             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
752087f3262SPaul Mullowney             for (j=offset; j<offset+nz; j++) {
753087f3262SPaul Mullowney               AAUp[j] = -AAUp[j];
754087f3262SPaul Mullowney               AALo[j] = AAUp[j]/v[nz];
755087f3262SPaul Mullowney             }
756087f3262SPaul Mullowney             offset+=nz;
757087f3262SPaul Mullowney           }
758087f3262SPaul Mullowney         }
759087f3262SPaul Mullowney 
760aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
761da79fbbcSStefano Zampini         ierr = PetscNew(&upTriFactor);CHKERRQ(ierr);
762da79fbbcSStefano Zampini         upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
763087f3262SPaul Mullowney 
764aa372e3fSPaul Mullowney         /* Create the matrix description */
76557d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&upTriFactor->descr);CHKERRCUSPARSE(stat);
76657d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
7671b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
768afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
769afb2bd1cSJunchao Zhang        #else
77057d48284SJunchao Zhang         stat = cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
771afb2bd1cSJunchao Zhang        #endif
77257d48284SJunchao Zhang         stat = cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
77357d48284SJunchao Zhang         stat = cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT);CHKERRCUSPARSE(stat);
774087f3262SPaul Mullowney 
775aa372e3fSPaul Mullowney         /* set the matrix */
776aa372e3fSPaul Mullowney         upTriFactor->csrMat = new CsrMatrix;
777aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_rows = A->rmap->n;
778aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_cols = A->cmap->n;
779aa372e3fSPaul Mullowney         upTriFactor->csrMat->num_entries = a->nz;
780aa372e3fSPaul Mullowney 
781aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
782aa372e3fSPaul Mullowney         upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
783aa372e3fSPaul Mullowney 
784aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
785aa372e3fSPaul Mullowney         upTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
786aa372e3fSPaul Mullowney 
787aa372e3fSPaul Mullowney         upTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
788aa372e3fSPaul Mullowney         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
789aa372e3fSPaul Mullowney 
790afb2bd1cSJunchao Zhang         /* set the operation */
791afb2bd1cSJunchao Zhang         upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;
792afb2bd1cSJunchao Zhang 
793afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
794da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
795afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&upTriFactor->solveInfo);CHKERRCUSPARSE(stat);
7961b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
797afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, upTriFactor->solveOp,
798afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
799afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
800afb2bd1cSJunchao Zhang                                        upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo,
801afb2bd1cSJunchao Zhang                                        &upTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
802afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&upTriFactor->solveBuffer,upTriFactor->solveBufferSize);CHKERRCUDA(cerr);
803afb2bd1cSJunchao Zhang       #endif
804afb2bd1cSJunchao Zhang 
805aa372e3fSPaul Mullowney         /* perform the solve analysis */
806aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, upTriFactor->solveOp,
807aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr,
808aa372e3fSPaul Mullowney                                  upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(),
809afb2bd1cSJunchao Zhang                                  upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo
8101b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
811afb2bd1cSJunchao Zhang                                  ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
812afb2bd1cSJunchao Zhang                                 #endif
813afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
814da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
815da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
816aa372e3fSPaul Mullowney 
817da79fbbcSStefano Zampini         /* assign the pointer */
818aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtr = upTriFactor;
819aa372e3fSPaul Mullowney 
820aa372e3fSPaul Mullowney         /* allocate space for the triangular factor information */
821da79fbbcSStefano Zampini         ierr = PetscNew(&loTriFactor);CHKERRQ(ierr);
822da79fbbcSStefano Zampini         loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
823aa372e3fSPaul Mullowney 
824aa372e3fSPaul Mullowney         /* Create the matrix description */
82557d48284SJunchao Zhang         stat = cusparseCreateMatDescr(&loTriFactor->descr);CHKERRCUSPARSE(stat);
82657d48284SJunchao Zhang         stat = cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
8271b0a6780SStefano Zampini        #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
828afb2bd1cSJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
829afb2bd1cSJunchao Zhang        #else
83057d48284SJunchao Zhang         stat = cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR);CHKERRCUSPARSE(stat);
831afb2bd1cSJunchao Zhang        #endif
83257d48284SJunchao Zhang         stat = cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER);CHKERRCUSPARSE(stat);
83357d48284SJunchao Zhang         stat = cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT);CHKERRCUSPARSE(stat);
834aa372e3fSPaul Mullowney 
835aa372e3fSPaul Mullowney         /* set the operation */
836aa372e3fSPaul Mullowney         loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE;
837aa372e3fSPaul Mullowney 
838aa372e3fSPaul Mullowney         /* set the matrix */
839aa372e3fSPaul Mullowney         loTriFactor->csrMat = new CsrMatrix;
840aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_rows = A->rmap->n;
841aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_cols = A->cmap->n;
842aa372e3fSPaul Mullowney         loTriFactor->csrMat->num_entries = a->nz;
843aa372e3fSPaul Mullowney 
844aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
845aa372e3fSPaul Mullowney         loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp+A->rmap->n+1);
846aa372e3fSPaul Mullowney 
847aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz);
848aa372e3fSPaul Mullowney         loTriFactor->csrMat->column_indices->assign(AjUp, AjUp+a->nz);
849aa372e3fSPaul Mullowney 
850aa372e3fSPaul Mullowney         loTriFactor->csrMat->values = new THRUSTARRAY(a->nz);
851aa372e3fSPaul Mullowney         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
852aa372e3fSPaul Mullowney 
853afb2bd1cSJunchao Zhang         /* Create the solve analysis information */
854da79fbbcSStefano Zampini         ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
855afb2bd1cSJunchao Zhang         stat = cusparse_create_analysis_info(&loTriFactor->solveInfo);CHKERRCUSPARSE(stat);
8561b0a6780SStefano Zampini       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
857afb2bd1cSJunchao Zhang         stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, loTriFactor->solveOp,
858afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
859afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
860afb2bd1cSJunchao Zhang                                        loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo,
861afb2bd1cSJunchao Zhang                                        &loTriFactor->solveBufferSize);CHKERRCUSPARSE(stat);
862afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&loTriFactor->solveBuffer,loTriFactor->solveBufferSize);CHKERRCUDA(cerr);
863afb2bd1cSJunchao Zhang       #endif
864afb2bd1cSJunchao Zhang 
865aa372e3fSPaul Mullowney         /* perform the solve analysis */
866aa372e3fSPaul Mullowney         stat = cusparse_analysis(cusparseTriFactors->handle, loTriFactor->solveOp,
867aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr,
868aa372e3fSPaul Mullowney                                  loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(),
869afb2bd1cSJunchao Zhang                                  loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo
8701b0a6780SStefano Zampini                                 #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
871afb2bd1cSJunchao Zhang                                  ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
872afb2bd1cSJunchao Zhang                                 #endif
873afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
874da79fbbcSStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
875da79fbbcSStefano Zampini         ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
876aa372e3fSPaul Mullowney 
877da79fbbcSStefano Zampini         /* assign the pointer */
878aa372e3fSPaul Mullowney         ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtr = loTriFactor;
879087f3262SPaul Mullowney 
880da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(((A->rmap->n+1)+(a->nz))*sizeof(int)+(a->nz)*sizeof(PetscScalar)));CHKERRQ(ierr);
88157d48284SJunchao Zhang         cerr = cudaFreeHost(AiUp);CHKERRCUDA(cerr);
88257d48284SJunchao Zhang         cerr = cudaFreeHost(AjUp);CHKERRCUDA(cerr);
883da79fbbcSStefano Zampini       } else {
884da79fbbcSStefano Zampini         /* Fill the upper triangular matrix */
885da79fbbcSStefano Zampini         offset = 0;
886da79fbbcSStefano Zampini         for (i=0; i<n; i++) {
887da79fbbcSStefano Zampini           /* set the pointers */
888da79fbbcSStefano Zampini           v  = aa + ai[i];
889da79fbbcSStefano Zampini           nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
890da79fbbcSStefano Zampini 
891da79fbbcSStefano Zampini           /* first, set the diagonal elements */
892da79fbbcSStefano Zampini           AAUp[offset] = 1.0/v[nz];
893da79fbbcSStefano Zampini           AALo[offset] = 1.0/v[nz];
894da79fbbcSStefano Zampini 
895da79fbbcSStefano Zampini           offset+=1;
896da79fbbcSStefano Zampini           if (nz>0) {
897da79fbbcSStefano Zampini             ierr = PetscArraycpy(&(AAUp[offset]), v, nz);CHKERRQ(ierr);
898da79fbbcSStefano Zampini             for (j=offset; j<offset+nz; j++) {
899da79fbbcSStefano Zampini               AAUp[j] = -AAUp[j];
900da79fbbcSStefano Zampini               AALo[j] = AAUp[j]/v[nz];
901da79fbbcSStefano Zampini             }
902da79fbbcSStefano Zampini             offset+=nz;
903da79fbbcSStefano Zampini           }
904da79fbbcSStefano Zampini         }
905da79fbbcSStefano Zampini         if (!upTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
906da79fbbcSStefano Zampini         if (!loTriFactor) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
907da79fbbcSStefano Zampini         upTriFactor->csrMat->values->assign(AAUp, AAUp+a->nz);
908da79fbbcSStefano Zampini         loTriFactor->csrMat->values->assign(AALo, AALo+a->nz);
909da79fbbcSStefano Zampini         ierr = PetscLogCpuToGpu(2*(a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
910da79fbbcSStefano Zampini       }
91157d48284SJunchao Zhang       cerr = cudaFreeHost(AAUp);CHKERRCUDA(cerr);
91257d48284SJunchao Zhang       cerr = cudaFreeHost(AALo);CHKERRCUDA(cerr);
913087f3262SPaul Mullowney     } catch(char *ex) {
914087f3262SPaul Mullowney       SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
915087f3262SPaul Mullowney     }
916087f3262SPaul Mullowney   }
917087f3262SPaul Mullowney   PetscFunctionReturn(0);
918087f3262SPaul Mullowney }
919087f3262SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU - Pushes the Cholesky/ICC triangular factors of A to the GPU.

  Builds (or refreshes) the triangular factors through MatSeqAIJCUSPARSEBuildICCTriMatrices(),
  makes sure the work vector exists, records the nonzero count of the factors and, when the
  ordering stored in a->row is not the identity, uploads that permutation and its inverse.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors

  Notes:
  This routine is invoked by every numeric factorization, so all device arrays owned by the
  tri-factor structure are allocated on the first pass only and reused on later ones.
*/
static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A)
{
  PetscErrorCode               ierr;
  Mat_SeqAIJ                   *a                  = (Mat_SeqAIJ*)A->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  IS                           ip = a->row;
  PetscBool                    perm_identity;
  PetscInt                     n = A->rmap->n;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  ierr = MatSeqAIJCUSPARSEBuildICCTriMatrices(A);CHKERRQ(ierr);
  if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
  /* NOTE(review): presumably the off-diagonal entries counted for both factors plus one diagonal - confirm */
  cusparseTriFactors->nnz=(a->nz-n)*2 + n;

  A->offloadmask = PETSC_OFFLOAD_BOTH;

  /* permutation indices are only needed for a non-natural ordering */
  ierr = ISIdentity(ip,&perm_identity);CHKERRQ(ierr);
  if (!perm_identity) {
    IS             iip;
    const PetscInt *irip,*rip;

    ierr = ISInvertPermutation(ip,PETSC_DECIDE,&iip);CHKERRQ(ierr);
    ierr = ISGetIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr);
    /* reuse the device arrays from a previous numeric factorization instead of leaking them */
    if (!cusparseTriFactors->rpermIndices) { cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->rpermIndices->assign(rip, rip+n);
    if (!cusparseTriFactors->cpermIndices) { cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); }
    cusparseTriFactors->cpermIndices->assign(irip, irip+n);
    ierr = ISRestoreIndices(iip,&irip);CHKERRQ(ierr);
    ierr = ISDestroy(&iip);CHKERRQ(ierr);
    ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr);
    ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
957087f3262SPaul Mullowney 
/*
  CHECK_LAUNCH_ERROR - Error check to place right after a kernel launch.

  First reports any error returned by cudaGetLastError(), then synchronizes the device and
  reports any error returned by cudaDeviceSynchronize(). Failures are raised through
  SETERRQ1() with PETSC_ERR_PLIB.
*/
#define CHECK_LAUNCH_ERROR() \
do { \
  /* synchronous (pre-launch) errors */ \
  cudaError_t launch_err = cudaGetLastError(); \
  if (launch_err != cudaSuccess) { \
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(launch_err)); \
  } \
  /* asynchronous errors, i.e. the kernel itself failed (ULF) */ \
  cudaError_t sync_err = cudaDeviceSynchronize(); \
  if (sync_err != cudaSuccess) { \
    SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Cuda error: %s",cudaGetErrorString(sync_err)); \
  } \
 } while (0)
9719ae82921SPaul Mullowney 
/*
  MatCholeskyFactorNumeric_SeqAIJCUSPARSE - Numeric Cholesky factorization of A into B.

  The factorization itself is done by MatCholeskyFactorNumeric_SeqAIJ() after the values of A
  have been brought back through MatSeqAIJCUSPARSECopyFromGPU(). Afterwards the solve routines
  of B are selected according to the ordering stored in B, and the triangular factors are
  sent to the GPU.

  Input Parameters:
+ B    - the factor matrix
. A    - the matrix to factor
- info - factorization options, forwarded unchanged
*/
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *fact = (Mat_SeqAIJ*)B->data;
  IS             ordering = fact->row;
  PetscBool      natural;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  ierr = MatCholeskyFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;

  /* no MatMatSolve variants are installed, whatever the ordering */
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* pick the MatSolve flavour matching the ordering */
  ierr = ISIdentity(ordering,&natural);CHKERRQ(ierr);
  if (!natural) {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
  } else {
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
  }

  /* build the triangular factors on the GPU */
  ierr = MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
10019ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSEBuildTriFactorTranspose_Private - Builds the explicit transpose (CSC) of one
  triangular factor and runs the cuSPARSE solve analysis on it.

  Input Parameters:
+ A                  - the factored matrix, used only as the object of the logged events
. cusparseTriFactors - container providing the cuSPARSE handle
- triFactor          - the factor to transpose; its csr2csc work buffer is allocated here (CUDA >= 11)

  Output Parameter:
. triFactorTranspose - the newly allocated transposed factor, set only on success
*/
static PetscErrorCode MatSeqAIJCUSPARSEBuildTriFactorTranspose_Private(Mat A,Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors,Mat_SeqAIJCUSPARSETriFactorStruct *triFactor,Mat_SeqAIJCUSPARSETriFactorStruct **triFactorTranspose)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *triFactorT;
  cusparseStatus_t                  stat;
  cusparseIndexBase_t               indexBase;
  cusparseMatrixType_t              matrixType;
  cusparseFillMode_t                fillMode;
  cusparseDiagType_t                diagType;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* allocate space for the transpose of the triangular factor */
  ierr = PetscNew(&triFactorT);CHKERRQ(ierr);
  triFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;

  /* read the descriptor of the factor; the fill mode is flipped since the transpose is stored */
  matrixType = cusparseGetMatType(triFactor->descr);
  indexBase = cusparseGetMatIndexBase(triFactor->descr);
  fillMode = cusparseGetMatFillMode(triFactor->descr)==CUSPARSE_FILL_MODE_UPPER ?
    CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER;
  diagType = cusparseGetMatDiagType(triFactor->descr);

  /* Create the matrix description */
  stat = cusparseCreateMatDescr(&triFactorT->descr);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatIndexBase(triFactorT->descr, indexBase);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatType(triFactorT->descr, matrixType);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatFillMode(triFactorT->descr, fillMode);CHKERRCUSPARSE(stat);
  stat = cusparseSetMatDiagType(triFactorT->descr, diagType);CHKERRCUSPARSE(stat);

  /* set the operation: the transpose is stored explicitly, so the solve is a plain one */
  triFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE;

  /* allocate GPU space for the CSC of the triangular factor */
  triFactorT->csrMat = new CsrMatrix;
  triFactorT->csrMat->num_rows       = triFactor->csrMat->num_cols;
  triFactorT->csrMat->num_cols       = triFactor->csrMat->num_rows;
  triFactorT->csrMat->num_entries    = triFactor->csrMat->num_entries;
  triFactorT->csrMat->row_offsets    = new THRUSTINTARRAY32(triFactorT->csrMat->num_rows+1);
  triFactorT->csrMat->column_indices = new THRUSTINTARRAY32(triFactorT->csrMat->num_entries);
  triFactorT->csrMat->values         = new THRUSTARRAY(triFactorT->csrMat->num_entries);

  /* compute the transpose of the triangular factor, i.e. the CSC */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  stat = cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                                       triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                                       triFactor->csrMat->values->data().get(),
                                       triFactor->csrMat->row_offsets->data().get(),
                                       triFactor->csrMat->column_indices->data().get(),
                                       triFactorT->csrMat->values->data().get(),
                                       triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                                       CUSPARSE_ACTION_NUMERIC,indexBase,
                                       CUSPARSE_CSR2CSC_ALG1, &triFactor->csr2cscBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&triFactor->csr2cscBuffer,triFactor->csr2cscBufferSize);CHKERRCUDA(cerr);
#endif

  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_csr2csc(cusparseTriFactors->handle, triFactor->csrMat->num_rows,
                          triFactor->csrMat->num_cols, triFactor->csrMat->num_entries,
                          triFactor->csrMat->values->data().get(),
                          triFactor->csrMat->row_offsets->data().get(),
                          triFactor->csrMat->column_indices->data().get(),
                          triFactorT->csrMat->values->data().get(),
                        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                          triFactorT->csrMat->row_offsets->data().get(), triFactorT->csrMat->column_indices->data().get(), cusparse_scalartype,
                          CUSPARSE_ACTION_NUMERIC, indexBase,
                          CUSPARSE_CSR2CSC_ALG1, triFactor->csr2cscBuffer
                        #else
                          triFactorT->csrMat->column_indices->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                          CUSPARSE_ACTION_NUMERIC, indexBase
                        #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  /* close the event opened above; every PetscLogEventBegin() needs its matching PetscLogEventEnd() */
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);

  /* Create the solve analysis information */
  ierr = PetscLogEventBegin(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);
  stat = cusparse_create_analysis_info(&triFactorT->solveInfo);CHKERRCUSPARSE(stat);
#if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
  stat = cusparse_get_svbuffsize(cusparseTriFactors->handle, triFactorT->solveOp,
                                 triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                                 triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                                 triFactorT->csrMat->column_indices->data().get(), triFactorT->solveInfo,
                                 &triFactorT->solveBufferSize);CHKERRCUSPARSE(stat);
  cerr = cudaMalloc(&triFactorT->solveBuffer,triFactorT->solveBufferSize);CHKERRCUDA(cerr);
#endif

  /* perform the solve analysis */
  stat = cusparse_analysis(cusparseTriFactors->handle, triFactorT->solveOp,
                           triFactorT->csrMat->num_rows, triFactorT->csrMat->num_entries, triFactorT->descr,
                           triFactorT->csrMat->values->data().get(), triFactorT->csrMat->row_offsets->data().get(),
                           triFactorT->csrMat->column_indices->data().get(), triFactorT->solveInfo
                          #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                           ,triFactorT->solvePolicy, triFactorT->solveBuffer
                          #endif
);CHKERRCUSPARSE(stat);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESolveAnalysis,A,0,0,0);CHKERRQ(ierr);

  *triFactorTranspose = triFactorT;
  PetscFunctionReturn(0);
}

/*
  MatSeqAIJCUSPARSEAnalyzeTransposeForSolve - Creates the explicit transposes of the lower and
  upper triangular factors of A and analyzes them for the triangular solves.

  Input Parameter:
. A - the factored matrix; A->spptr must hold a Mat_SeqAIJCUSPARSETriFactors whose
      loTriFactorPtr and upTriFactorPtr are already set

  Notes:
  On return loTriFactorPtrTranspose and upTriFactorPtrTranspose of the tri-factor container
  point to the new structures. The lower factor is processed first; its pointer is stored
  before the upper factor is touched.
*/
static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A)
{
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* transpose of the lower triangular factor */
  ierr = MatSeqAIJCUSPARSEBuildTriFactorTranspose_Private(A,cusparseTriFactors,loTriFactor,&loTriFactorT);CHKERRQ(ierr);
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->loTriFactorPtrTranspose = loTriFactorT;

  /* transpose of the upper triangular factor */
  ierr = MatSeqAIJCUSPARSEBuildTriFactorTranspose_Private(A,cusparseTriFactors,upTriFactor,&upTriFactorT);CHKERRQ(ierr);
  ((Mat_SeqAIJCUSPARSETriFactors*)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(0);
}
1203bda325fcSPaul Mullowney 
/* Unary functor, callable on host and device, converting the real part of a PetscScalar to a PetscInt */
struct PetscScalarToPetscInt
{
  __host__ __device__
  PetscInt operator()(PetscScalar value)
  {
    /* any imaginary part is dropped before the conversion */
    const PetscReal realPart = PetscRealPart(value);
    return (PetscInt)realPart;
  }
};
1212a49f1ed0SStefano Zampini 
/*
  MatSeqAIJCUSPARSEFormExplicitTransposeForMult - builds, or refreshes the values of, the explicit
  transpose that is stored in ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose.

  Input Parameter:
. A - the matrix; A->spptr is used as Mat_SeqAIJCUSPARSE and A->data as Mat_SeqAIJ

  Notes:
  Returns immediately when A->form_explicit_transpose is false, when either local dimension is zero,
  or when A->transupdated is already set (in which case a missing matTranspose is an error).

  CSR format: the transpose structure is created once. The permutation from CSR to CSC ordering is
  obtained by running csr2csc on a values array filled with 0,1,2,... and is cached in
  cusparsestruct->csr2csc_i; the transposed values are then gathered through that permutation.

  ELL/HYB formats: MatSeqAIJCUSPARSEInvalidateTranspose() is called first and the transpose is rebuilt
  via HYB -> CSR -> CSC -> HYB. These formats raise PETSC_ERR_SUP when built against CUDA >= 11.0.

  On success A->transupdated is set to PETSC_TRUE.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTransposeForMult(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  cusparseStatus_t             stat;
  cusparseIndexBase_t          indexBase;
  cudaError_t                  err;
  PetscErrorCode               ierr;

  PetscFunctionBegin;
  if (!A->form_explicit_transpose || !A->rmap->n || !A->cmap->n) PetscFunctionReturn(0);
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
  if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
  if (A->transupdated && !matstructT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(0);
  ierr = PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) {
    /* only the CSR format can update an existing transpose in place; other formats start over */
    ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
  }
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    stat = cusparseCreateMatDescr(&matstructT->descr);CHKERRCUSPARSE(stat);
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    stat = cusparseSetMatIndexBase(matstructT->descr, indexBase);CHKERRCUSPARSE(stat);
    stat = cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

    /* set alpha and beta: device-resident scalar constants 1, 0 and 1 */
    err = cudaMalloc((void **)&(matstructT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMalloc((void **)&(matstructT->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
    err = cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* allocate storage for the transpose only; it is filled by the CSR update section below */
      CsrMatrix *matrixT = new CsrMatrix;
      matstructT->mat = matrixT;
      matrixT->num_rows = A->cmap->n;
      matrixT->num_cols = A->rmap->n;
      matrixT->num_entries = a->nz;
      matrixT->row_offsets = new THRUSTINTARRAY32(matrixT->num_rows+1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values = new THRUSTARRAY(a->nz);

      /* uncompressed row offsets of A on the device, taken from the host array a->i */
      if (!cusparsestruct->rowoffsets_gpu) { cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); }
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i+A->rmap->n+1);

     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      stat = cusparseCreateCsr(&matstructT->matDescr,
                               matrixT->num_rows, matrixT->num_cols, matrixT->num_entries,
                               matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(),
                               matrixT->values->data().get(),
                               CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase,cusparse_scalartype);CHKERRCUSPARSE(stat);
     #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
   #else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows = A->rmap->n;
      temp->num_cols = A->cmap->n;
      temp->num_entries = a->nz;
      temp->row_offsets = new THRUSTINTARRAY32(A->rmap->n+1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle,
                              matstruct->descr, (cusparseHybMat_t)matstruct->mat,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get());CHKERRCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose).
         The transpose has A->cmap->n rows and A->rmap->n columns, exactly as in the CSR branch above,
         so the column-pointer array written by csr2csc needs A->cmap->n+1 entries. */
      tempT->num_rows = A->cmap->n;
      tempT->num_cols = A->rmap->n;
      tempT->num_entries = a->nz;
      tempT->row_offsets = new THRUSTINTARRAY32(tempT->num_rows+1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows,
                              temp->num_cols, temp->num_entries,
                              temp->values->data().get(),
                              temp->row_offsets->data().get(),
                              temp->column_indices->data().get(),
                              tempT->values->data().get(),
                              tempT->column_indices->data().get(),
                              tempT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);

      /* Last, convert CSC to HYB; the CSR arrays in tempT describe the transpose, so pass its dimensions */
      cusparseHybMat_t hybMat;
      stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
      cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
        CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat = cusparse_csr2hyb(cusparsestruct->handle, tempT->num_rows, tempT->num_cols,
                              matstructT->descr, tempT->values->data().get(),
                              tempT->row_offsets->data().get(),
                              tempT->column_indices->data().get(),
                              hybMat, 0, partition);CHKERRCUSPARSE(stat);

      /* assign the pointer; A->transupdated is set at the end of the function, once matTranspose is stored */
      matstructT->mat = hybMat;
      /* delete temporaries */
      delete (THRUSTARRAY*) tempT->values;
      delete (THRUSTINTARRAY32*) tempT->column_indices;
      delete (THRUSTINTARRAY32*) tempT->row_offsets;
      delete (CsrMatrix*) tempT;
      delete (THRUSTARRAY*) temp->values;
      delete (THRUSTINTARRAY32*) temp->column_indices;
      delete (THRUSTINTARRAY32*) temp->row_offsets;
      delete (CsrMatrix*) temp;
     #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix*)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix*)matstructT->mat;
    if (!matrix) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix");
    if (!matrix->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix rows");
    if (!matrix->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix cols");
    if (!matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrix values");
    if (!matrixT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT");
    if (!matrixT->row_offsets) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT rows");
    if (!matrixT->column_indices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT cols");
    if (!matrixT->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
      ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Compute the CSR -> CSC permutation once: transpose a values array holding 0,1,2,... so that
         each transposed "value" is the position of that entry in the original CSR values array */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscStackCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      void   *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n,
                                           A->cmap->n, matrix->num_entries,
                                           matrix->values->data().get(),
                                           cusparsestruct->rowoffsets_gpu->data().get(),
                                           matrix->column_indices->data().get(),
                                           matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                                           CUSPARSE_ACTION_NUMERIC,indexBase,
                                           cusparsestruct->csr2cscAlg, &csr2cscBufferSize);CHKERRCUSPARSE(stat);
      err = cudaMalloc(&csr2cscBuffer,csr2cscBufferSize);CHKERRCUDA(err);
     #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n,
                              A->cmap->n,matrix->num_entries,
                              csr2csc_a.data().get(),
                              cusparsestruct->rowoffsets_gpu->data().get(),
                              matrix->column_indices->data().get(),
                              matrixT->values->data().get(),
                             #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
                              matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype,
                              CUSPARSE_ACTION_NUMERIC,indexBase,
                              cusparsestruct->csr2cscAlg, csr2cscBuffer);CHKERRCUSPARSE(stat);
                             #else
                              matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(),
                              CUSPARSE_ACTION_NUMERIC, indexBase);CHKERRCUSPARSE(stat);
                             #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(),indexBase);
      }

      /* turn the transposed sequence (stored as scalars) into integer indices and cache it */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscStackCallThrust(thrust::transform(thrust::device,matrixT->values->begin(),matrixT->values->end(),cusparsestruct->csr2csc_i->begin(),PetscScalarToPetscInt()));
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      err = cudaFree(csr2cscBuffer);CHKERRCUDA(err);
     #endif
    }
    /* gather the current values of A into the transpose through the cached permutation */
    PetscStackCallThrust(thrust::copy(thrust::device,thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()),
                                                     thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()),
                                                     matrixT->values->begin()));
  }
  ierr = PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose,A,0,0,0);CHKERRQ(ierr);
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE*)A->spptr)->matTranspose = matstructT;
  A->transupdated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
1419bda325fcSPaul Mullowney 
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - transpose solve with the factors held in A->spptr
  (a Mat_SeqAIJCUSPARSETriFactors), applying its row and column permutation index arrays.

  Input Parameters:
+ A  - the factored matrix
- bb - the right-hand side, read on the GPU

  Output Parameter:
. xx - the solution, written on the GPU

  Notes:
  The transposed triangular factors are created on first use by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve().
  The Thrust copies are wrapped in PetscStackCallThrust() so that a Thrust exception is reported as a
  PETSc error instead of escaping this function.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  cusparseStatus_t                      stat;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation: x = b[rpermIndices] */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU+n, cusparseTriFactors->rpermIndices->end()),
               xGPU));

  /* First, solve U: input is xarray, output goes to the work vector */
  stat = cusparse_solve(cusparseTriFactors->handle, upTriFactorT->solveOp,
                        upTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upTriFactorT->descr,
                        upTriFactorT->csrMat->values->data().get(),
                        upTriFactorT->csrMat->row_offsets->data().get(),
                        upTriFactorT->csrMat->column_indices->data().get(),
                        upTriFactorT->solveInfo,
                        xarray, tempGPU->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upTriFactorT->solvePolicy, upTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Then, solve L: input is the work vector, output goes back to xarray */
  stat = cusparse_solve(cusparseTriFactors->handle, loTriFactorT->solveOp,
                        loTriFactorT->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        loTriFactorT->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, loTriFactorT->descr,
                        loTriFactorT->csrMat->values->data().get(),
                        loTriFactorT->csrMat->row_offsets->data().get(),
                        loTriFactorT->csrMat->column_indices->data().get(),
                        loTriFactorT->solveInfo,
                        tempGPU->data().get(), xarray
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,loTriFactorT->solvePolicy, loTriFactorT->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(xGPU+n, cusparseTriFactors->cpermIndices->end()),
               tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscStackCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1506bda325fcSPaul Mullowney 
/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - transpose solve with the factors held in A->spptr
  (a Mat_SeqAIJCUSPARSETriFactors); no permutation of the vectors is applied.

  Input Parameters:
+ A  - the factored matrix
- bb - the right-hand side, read on the GPU

  Output Parameter:
. xx - the solution, written on the GPU

  Notes:
  The transposed triangular factors are created on first use by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve().
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loT     = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upT     = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *rhs;
  PetscScalar                       *sol;
  cusparseStatus_t                  cs;
  cudaError_t                       cuerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  /* Neither transposed factor is cached yet: run the analysis and pick up the new pointers */
  if (!(loT || upT)) {
    ierr = MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A);CHKERRQ(ierr);
    loT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtrTranspose;
    upT  = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtrTranspose;
  }

  /* Device arrays: xx for writing, bb for reading */
  ierr = VecCUDAGetArrayWrite(xx,&sol);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&rhs);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Step 1: solve with the transposed U factor, from the right-hand side into the work vector */
  cs = cusparse_solve(factors->handle, upT->solveOp,
                      upT->csrMat->num_rows,
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      upT->csrMat->num_entries,
                    #endif
                      &PETSC_CUSPARSE_ONE, upT->descr,
                      upT->csrMat->values->data().get(),
                      upT->csrMat->row_offsets->data().get(),
                      upT->csrMat->column_indices->data().get(),
                      upT->solveInfo,
                      rhs, work->data().get()
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      ,upT->solvePolicy, upT->solveBuffer
                    #endif
);CHKERRCUSPARSE(cs);

  /* Step 2: solve with the transposed L factor, from the work vector into the solution */
  cs = cusparse_solve(factors->handle, loT->solveOp,
                      loT->csrMat->num_rows,
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      loT->csrMat->num_entries,
                    #endif
                      &PETSC_CUSPARSE_ONE, loT->descr,
                      loT->csrMat->values->data().get(),
                      loT->csrMat->row_offsets->data().get(),
                      loT->csrMat->column_indices->data().get(),
                      loT->solveInfo,
                      work->data().get(), sol
                    #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                      ,loT->solvePolicy, loT->solveBuffer
                    #endif
);CHKERRCUSPARSE(cs);

  /* Hand the arrays back, wait for the GPU, then close the timer and log the flops */
  ierr  = VecCUDARestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr  = VecCUDARestoreArrayWrite(xx,&sol);CHKERRQ(ierr);
  cuerr = WaitForCUDA();CHKERRCUDA(cuerr);
  ierr  = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr  = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1574bda325fcSPaul Mullowney 
15756fa9248bSJed Brown static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A,Vec bb,Vec xx)
15769ae82921SPaul Mullowney {
1577465f34aeSAlejandro Lamas Daviña   const PetscScalar                     *barray;
1578465f34aeSAlejandro Lamas Daviña   PetscScalar                           *xarray;
1579465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<const PetscScalar> bGPU;
1580465f34aeSAlejandro Lamas Daviña   thrust::device_ptr<PetscScalar>       xGPU;
15819ae82921SPaul Mullowney   cusparseStatus_t                      stat;
15829ae82921SPaul Mullowney   Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
1583aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->loTriFactorPtr;
1584aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSETriFactorStruct     *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct*)cusparseTriFactors->upTriFactorPtr;
1585aa372e3fSPaul Mullowney   THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
1586b175d8bbSPaul Mullowney   PetscErrorCode                        ierr;
158757d48284SJunchao Zhang   cudaError_t                           cerr;
15889ae82921SPaul Mullowney 
15899ae82921SPaul Mullowney   PetscFunctionBegin;
1590ebc8f436SDominic Meiser 
1591e057df02SPaul Mullowney   /* Get the GPU pointers */
1592c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
1593c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
1594c41cb2e2SAlejandro Lamas Daviña   xGPU = thrust::device_pointer_cast(xarray);
1595c41cb2e2SAlejandro Lamas Daviña   bGPU = thrust::device_pointer_cast(barray);
15969ae82921SPaul Mullowney 
15977a052e47Shannah_mairs   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1598aa372e3fSPaul Mullowney   /* First, reorder with the row permutation */
1599a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
1600c41cb2e2SAlejandro Lamas Daviña                thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
16014e4bbfaaSStefano Zampini                tempGPU->begin());
1602aa372e3fSPaul Mullowney 
1603aa372e3fSPaul Mullowney   /* Next, solve L */
1604aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, loTriFactor->solveOp,
1605afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_rows,
16061b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1607afb2bd1cSJunchao Zhang                         loTriFactor->csrMat->num_entries,
1608afb2bd1cSJunchao Zhang                       #endif
1609afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, loTriFactor->descr,
1610aa372e3fSPaul Mullowney                         loTriFactor->csrMat->values->data().get(),
1611aa372e3fSPaul Mullowney                         loTriFactor->csrMat->row_offsets->data().get(),
1612aa372e3fSPaul Mullowney                         loTriFactor->csrMat->column_indices->data().get(),
1613aa372e3fSPaul Mullowney                         loTriFactor->solveInfo,
1614afb2bd1cSJunchao Zhang                         tempGPU->data().get(), xarray
16151b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1616afb2bd1cSJunchao Zhang                         ,loTriFactor->solvePolicy, loTriFactor->solveBuffer
1617afb2bd1cSJunchao Zhang                       #endif
1618afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1619aa372e3fSPaul Mullowney 
1620aa372e3fSPaul Mullowney   /* Then, solve U */
1621aa372e3fSPaul Mullowney   stat = cusparse_solve(cusparseTriFactors->handle, upTriFactor->solveOp,
1622afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_rows,
16231b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1624afb2bd1cSJunchao Zhang                         upTriFactor->csrMat->num_entries,
1625afb2bd1cSJunchao Zhang                       #endif
1626afb2bd1cSJunchao Zhang                         &PETSC_CUSPARSE_ONE, upTriFactor->descr,
1627aa372e3fSPaul Mullowney                         upTriFactor->csrMat->values->data().get(),
1628aa372e3fSPaul Mullowney                         upTriFactor->csrMat->row_offsets->data().get(),
1629aa372e3fSPaul Mullowney                         upTriFactor->csrMat->column_indices->data().get(),
1630aa372e3fSPaul Mullowney                         upTriFactor->solveInfo,
1631afb2bd1cSJunchao Zhang                         xarray, tempGPU->data().get()
16321b0a6780SStefano Zampini                       #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
1633afb2bd1cSJunchao Zhang                         ,upTriFactor->solvePolicy, upTriFactor->solveBuffer
1634afb2bd1cSJunchao Zhang                       #endif
1635afb2bd1cSJunchao Zhang );CHKERRCUSPARSE(stat);
1636aa372e3fSPaul Mullowney 
16374e4bbfaaSStefano Zampini   /* Last, reorder with the column permutation */
1638a0e72f99SJunchao Zhang   thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
16394e4bbfaaSStefano Zampini                thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
16404e4bbfaaSStefano Zampini                xGPU);
16419ae82921SPaul Mullowney 
1642c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
1643c41cb2e2SAlejandro Lamas Daviña   ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
164405035670SJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
1645661c2d29Shannah_mairs   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1646958c4211Shannah_mairs   ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
16479ae82921SPaul Mullowney   PetscFunctionReturn(0);
16489ae82921SPaul Mullowney }
16499ae82921SPaul Mullowney 
/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - triangular solves with the lower and upper factors
  stored in A->spptr, without any row/column permutation.

  Input:
    A  - factored matrix; A->spptr holds the Mat_SeqAIJCUSPARSETriFactors
    bb - right-hand side (read on the device)

  Output:
    xx - solution (written on the device)

  The lower solve reads bb and writes the factor work vector; the upper solve reads the work
  vector and writes xx.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqAIJCUSPARSETriFactors      *factors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *lower   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upper   = (Mat_SeqAIJCUSPARSETriFactorStruct*)factors->upTriFactorPtr;
  THRUSTARRAY                       *work    = (THRUSTARRAY*)factors->workVector;
  const PetscScalar                 *b_d;
  PetscScalar                       *x_d;
  PetscErrorCode                    ierr;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;

  PetscFunctionBegin;
  /* Obtain device pointers: xx for writing, bb for reading */
  ierr = VecCUDAGetArrayWrite(xx,&x_d);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&b_d);CHKERRQ(ierr);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* Step 1: lower triangular solve, bb -> work */
  stat = cusparse_solve(factors->handle, lower->solveOp,
                        lower->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        lower->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, lower->descr,
                        lower->csrMat->values->data().get(),
                        lower->csrMat->row_offsets->data().get(),
                        lower->csrMat->column_indices->data().get(),
                        lower->solveInfo,
                        b_d, work->data().get()
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,lower->solvePolicy, lower->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Step 2: upper triangular solve, work -> xx */
  stat = cusparse_solve(factors->handle, upper->solveOp,
                        upper->csrMat->num_rows,
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        upper->csrMat->num_entries,
                      #endif
                        &PETSC_CUSPARSE_ONE, upper->descr,
                        upper->csrMat->values->data().get(),
                        upper->csrMat->row_offsets->data().get(),
                        upper->csrMat->column_indices->data().get(),
                        upper->solveInfo,
                        work->data().get(), x_d
                      #if PETSC_PKG_CUDA_VERSION_GE(9,0,0)
                        ,upper->solvePolicy, upper->solveBuffer
                      #endif
);CHKERRCUSPARSE(stat);

  /* Hand the arrays back, then wait for the device before closing the GPU timer */
  ierr = VecCUDARestoreArrayRead(bb,&b_d);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&x_d);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*factors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
17099ae82921SPaul Mullowney 
/*
  MatSeqAIJCUSPARSECopyFromGPU - brings the nonzero values of A back to the host array a->a.

  Only acts when A->offloadmask is PETSC_OFFLOAD_GPU; in that case a->nz scalars are copied
  device-to-host from the CsrMatrix values array and the mask becomes PETSC_OFFLOAD_BOTH.
  Only the values are transferred here, not the row offsets or column indices.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE *gpu = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *aij = (Mat_SeqAIJ*)A->data;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  /* nothing to do unless the device holds the only up-to-date copy */
  if (A->offloadmask != PETSC_OFFLOAD_GPU) PetscFunctionReturn(0);

  csr  = (CsrMatrix*)gpu->mat->mat;
  ierr = PetscLogEventBegin(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  cerr = cudaMemcpy(aij->a, csr->values->data().get(), aij->nz*sizeof(PetscScalar), cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu(aij->nz*sizeof(PetscScalar));CHKERRQ(ierr);
  ierr = PetscLogEventEnd(MAT_CUSPARSECopyFromGPU,A,0,0,0);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
17307e8381f9SStefano Zampini 
/*
  MatSeqAIJGetArray_SeqAIJCUSPARSE - returns the host values array of A.

  The values are first synchronized from the device (if needed); the offload mask is then set
  to PETSC_OFFLOAD_CPU since the pointer handed out is writable.

  Output:
    array - set to a->a
*/
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A,PetscScalar *array[])
{
  PetscErrorCode ierr;
  Mat_SeqAIJ     *aij = (Mat_SeqAIJ*)A->data;

  PetscFunctionBegin;
  ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = aij->a;
  PetscFunctionReturn(0);
}
17427e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSEBuildCsrMatrix_Private - allocates a CsrMatrix and uploads host CSR data into it.

  Input:
    m   - number of rows; m+1 row offsets are read from ii
    n   - number of columns
    nnz - number of column indices (and values, when present) to upload
    ii  - host row offsets
    a   - host SeqAIJ data; a->j supplies the column indices and a->a, when non-NULL, the values

  Returns the new CsrMatrix. The values array is always allocated with nnz entries but is only
  filled when a->a is non-NULL. The caller owns the object and its three arrays.
*/
static CsrMatrix *MatSeqAIJCUSPARSEBuildCsrMatrix_Private(PetscInt m,PetscInt n,PetscInt nnz,PetscInt *ii,Mat_SeqAIJ *a)
{
  CsrMatrix *mat = new CsrMatrix;

  mat->num_rows    = m;
  mat->num_cols    = n;
  mat->num_entries = nnz;

  mat->row_offsets = new THRUSTINTARRAY32(m+1);
  mat->row_offsets->assign(ii, ii + m+1);

  mat->column_indices = new THRUSTINTARRAY32(nnz);
  mat->column_indices->assign(a->j, a->j+nnz);

  mat->values = new THRUSTARRAY(nnz);
  if (a->a) mat->values->assign(a->a, a->a+nnz);
  return mat;
}

/*
  MatSeqAIJCUSPARSECopyToGPU - makes the device copy of A current.

  Does nothing unless A->offloadmask is PETSC_OFFLOAD_UNALLOCATED or PETSC_OFFLOAD_CPU.
  Errors if A is bound to the CPU.

  Two paths:
    - values only: taken when the nonzero state recorded on the device matches A->nonzerostate
      and the format is MAT_CUSPARSE_CSR; just re-uploads a->a.
    - full rebuild: destroys the existing device structure and recreates it (CSR, or ELL/HYB
      before CUDA-11) from the host CSR arrays, using the compressed-row data when
      a->compressedrow.use is set.

  On success the offload mask becomes PETSC_OFFLOAD_BOTH, except when the host has no values
  array (a->a == NULL) during a full rebuild, in which case the mask is left unchanged.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct = cusparsestruct->mat;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
  PetscInt                     m = A->rmap->n,*ii,*ridx,tmp;
  PetscErrorCode               ierr;
  cusparseStatus_t             stat;
  PetscBool                    both = PETSC_TRUE;
  cudaError_t                  err;

  PetscFunctionBegin;
  if (A->boundtocpu) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix*)cusparsestruct->mat->mat;

      if (a->nz && !a->a) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR values");
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      matrix->values->assign(a->a, a->a+a->nz);
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogCpuToGpu((a->nz)*sizeof(PetscScalar));CHKERRQ(ierr);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* values changed: mark the cached transpose out of date without destroying it */
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
    } else {
      PetscInt nnz;
      ierr = PetscLogEventBegin(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* throw away everything derived from the old nonzero pattern */
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat,cusparsestruct->format);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
      delete cusparsestruct->workVector;
      delete cusparsestruct->rowoffsets_gpu;
      cusparsestruct->workVector = NULL;
      cusparsestruct->rowoffsets_gpu = NULL;
      try {
        if (a->compressedrow.use) {
          m    = a->compressedrow.nrows;
          ii   = a->compressedrow.i;
          ridx = a->compressedrow.rindex;
        } else {
          m    = A->rmap->n;
          ii   = a->i;
          ridx = NULL;
        }
        if (!ii) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR row data");
        if (m && !a->j) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing CSR column data");
        /* without host values only the pattern is uploaded, so the device copy is not "both" */
        if (!a->a) { nnz = ii[m]; both = PETSC_FALSE; }
        else nnz = a->nz;

       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        /* reject unsupported formats before allocating anything, so this error path does not leak matstruct and its device scalars */
        if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #endif

        /* create cusparse matrix */
        cusparsestruct->nrows = m;
        matstruct = new Mat_SeqAIJCUSPARSEMultStruct;
        stat = cusparseCreateMatDescr(&matstruct->descr);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
        stat = cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);

        /* device-resident scalar constants; the handle is switched to device pointer mode */
        err = cudaMalloc((void **)&(matstruct->alpha_one),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_zero),sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMalloc((void **)&(matstruct->beta_one), sizeof(PetscScalar));CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        err = cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(err);
        stat = cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);

        /* Build the device matrix in the selected storage format */
        if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
          /* set the matrix */
          CsrMatrix *mat = MatSeqAIJCUSPARSEBuildCsrMatrix_Private(m,A->cmap->n,nnz,ii,a);

          /* assign the pointer */
          matstruct->mat = mat;
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          if (mat->num_rows) { /* cusparse errors on empty matrices! */
            stat = cusparseCreateCsr(&matstruct->matDescr,
                                    mat->num_rows, mat->num_cols, mat->num_entries,
                                    mat->row_offsets->data().get(), mat->column_indices->data().get(),
                                    mat->values->data().get(),
                                    CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
                                    CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
          }
         #endif
        } else if (cusparsestruct->format==MAT_CUSPARSE_ELL || cusparsestruct->format==MAT_CUSPARSE_HYB) {
         #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
          /* not reachable: these formats were rejected before any allocation */
         #else
          /* stage the data as CSR on the device, convert it to HYB/ELL, then drop the staging copy */
          CsrMatrix *mat = MatSeqAIJCUSPARSEBuildCsrMatrix_Private(m,A->cmap->n,nnz,ii,a);

          cusparseHybMat_t hybMat;
          stat = cusparseCreateHybMat(&hybMat);CHKERRCUSPARSE(stat);
          cusparseHybPartition_t partition = cusparsestruct->format==MAT_CUSPARSE_ELL ?
            CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
          stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols,
              matstruct->descr, mat->values->data().get(),
              mat->row_offsets->data().get(),
              mat->column_indices->data().get(),
              hybMat, 0, partition);CHKERRCUSPARSE(stat);
          /* assign the pointer */
          matstruct->mat = hybMat;

          if (mat) {
            if (mat->values) delete (THRUSTARRAY*)mat->values;
            if (mat->column_indices) delete (THRUSTINTARRAY32*)mat->column_indices;
            if (mat->row_offsets) delete (THRUSTINTARRAY32*)mat->row_offsets;
            delete (CsrMatrix*)mat;
          }
         #endif
        }

        /* assign the compressed row indices */
        if (a->compressedrow.use) {
          cusparsestruct->workVector = new THRUSTARRAY(m);
          matstruct->cprowIndices    = new THRUSTINTARRAY(m);
          matstruct->cprowIndices->assign(ridx,ridx+m);
          tmp = m;
        } else {
          cusparsestruct->workVector = NULL;
          matstruct->cprowIndices    = NULL;
          tmp = 0;
        }
        ierr = PetscLogCpuToGpu(((m+1)+(a->nz))*sizeof(int)+tmp*sizeof(PetscInt)+(3+(a->nz))*sizeof(PetscScalar));CHKERRQ(ierr);

        /* assign the pointer */
        cusparsestruct->mat = matstruct;
      } catch(char *ex) {
        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
      }
      err  = WaitForCUDA();CHKERRCUDA(err);
      ierr = PetscLogEventEnd(MAT_CUSPARSECopyToGPU,A,0,0,0);CHKERRQ(ierr);
      /* remember which nonzero pattern the device structure corresponds to */
      cusparsestruct->nonzerostate = A->nonzerostate;
    }
    if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(0);
}
18999ae82921SPaul Mullowney 
/* Functor over 2-tuples: adds element 0 into element 1, i.e. get<1>(t) = get<1>(t) + get<0>(t) */
struct VecCUDAPlusEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t);
  }
};
1909aa372e3fSPaul Mullowney 
/* Functor over 2-tuples: copies element 0 into element 1, i.e. get<1>(t) = get<0>(t) */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
19197e8381f9SStefano Zampini 
/* Functor over 2-tuples: copies element 1 into element 0, i.e. get<0>(t) = get<1>(t) */
struct VecCUDAEqualsReverse
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<0>(t) = thrust::get<1>(t);
  }
};
1929e6e9a74fSStefano Zampini 
/*
  Per-product data attached to a Mat_Product by the CUSPARSE matrix-matrix routines.
  Released by MatDestroy_MatMatCusparse().
*/
struct MatMatCusparse {
  PetscBool             cisdense;      /* NOTE(review): presumably records whether C was originally a CPU dense matrix - confirm against the symbolic phase */
  PetscScalar           *Bt;           /* device buffer, released with cudaFree() */
  Mat                   X;             /* intermediate dense matrix; receives the sparse-dense product for PtAP and RARt */
  PetscBool             reusesym;      /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */
  PetscLogDouble        flops;         /* NOTE(review): presumably the flop count logged for the product - confirm */
  CsrMatrix             *Bcsr;         /* released with delete; only the struct itself is freed there */
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseSpMatDescr_t  matSpBDescr;   /* sparse descriptor, destroyed with cusparseDestroySpMat() */
  PetscBool             initialized;   /* C = alpha op(A) op(B) + beta C */
  cusparseDnMatDescr_t  matBDescr;     /* dense descriptor wrapping the B array */
  cusparseDnMatDescr_t  matCDescr;     /* dense descriptor wrapping the C (or X) array */
  PetscInt              Blda,Clda;     /* Record leading dimensions of B and C here to detect changes */
  size_t                mmBufferSize;
  void                  *mmBuffer;     /* device work buffer, released with cudaFree() */
  void                  *mmBuffer2;    /* SpGEMM WorkEstimation buffer */
  cusparseSpGEMMDescr_t spgemmDesc;    /* destroyed with cusparseSpGEMM_destroyDescr() */
#endif
};
1949ccdfe979SStefano Zampini 
/*
  MatDestroy_MatMatCusparse - releases a MatMatCusparse object and everything it owns.

  Input:
    data - pointer to the MatMatCusparse; freed with PetscFree() at the end

  Frees the device buffers, the Bcsr struct, the cuSPARSE descriptors (CUDA >= 11 only, each
  only when non-NULL) and the intermediate matrix X.
*/
static PetscErrorCode MatDestroy_MatMatCusparse(void *data)
{
  MatMatCusparse   *mm = (MatMatCusparse *)data;
  PetscErrorCode   ierr;
  cudaError_t      cerr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  cusparseStatus_t stat;
 #endif

  PetscFunctionBegin;
  cerr = cudaFree(mm->Bt);CHKERRCUDA(cerr);
  delete mm->Bcsr;
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (mm->matSpBDescr) {
    stat = cusparseDestroySpMat(mm->matSpBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->mmBuffer) {
    cerr = cudaFree(mm->mmBuffer);CHKERRCUDA(cerr);
  }
  if (mm->mmBuffer2) {
    cerr = cudaFree(mm->mmBuffer2);CHKERRCUDA(cerr);
  }
  if (mm->matBDescr) {
    stat = cusparseDestroyDnMat(mm->matBDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->matCDescr) {
    stat = cusparseDestroyDnMat(mm->matCDescr);CHKERRCUSPARSE(stat);
  }
  if (mm->spgemmDesc) {
    stat = cusparseSpGEMM_destroyDescr(mm->spgemmDesc);CHKERRCUSPARSE(stat);
  }
 #endif
  ierr = MatDestroy(&mm->X);CHKERRQ(ierr);
  ierr = PetscFree(data);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1974ccdfe979SStefano Zampini 
1975ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(Mat,Mat,Mat,PetscBool,PetscBool);
1976ccdfe979SStefano Zampini 
1977ccdfe979SStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
1978ccdfe979SStefano Zampini {
1979ccdfe979SStefano Zampini   Mat_Product                  *product = C->product;
1980ccdfe979SStefano Zampini   Mat                          A,B;
1981afb2bd1cSJunchao Zhang   PetscInt                     m,n,blda,clda;
1982ccdfe979SStefano Zampini   PetscBool                    flg,biscuda;
1983ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSE           *cusp;
1984ccdfe979SStefano Zampini   cusparseStatus_t             stat;
1985ccdfe979SStefano Zampini   cusparseOperation_t          opA;
1986ccdfe979SStefano Zampini   const PetscScalar            *barray;
1987ccdfe979SStefano Zampini   PetscScalar                  *carray;
1988ccdfe979SStefano Zampini   PetscErrorCode               ierr;
1989ccdfe979SStefano Zampini   MatMatCusparse               *mmdata;
1990ccdfe979SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *mat;
1991ccdfe979SStefano Zampini   CsrMatrix                    *csrmat;
1992afb2bd1cSJunchao Zhang   cudaError_t                  cerr;
1993ccdfe979SStefano Zampini 
1994ccdfe979SStefano Zampini   PetscFunctionBegin;
1995ccdfe979SStefano Zampini   MatCheckProduct(C,1);
1996ccdfe979SStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
1997ccdfe979SStefano Zampini   mmdata = (MatMatCusparse*)product->data;
1998ccdfe979SStefano Zampini   A    = product->A;
1999ccdfe979SStefano Zampini   B    = product->B;
2000ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2001ccdfe979SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2002ccdfe979SStefano Zampini   /* currently CopyToGpu does not copy if the matrix is bound to CPU
2003ccdfe979SStefano Zampini      Instead of silently accepting the wrong answer, I prefer to raise the error */
2004ccdfe979SStefano Zampini   if (A->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
2005ccdfe979SStefano Zampini   ierr   = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2006ccdfe979SStefano Zampini   cusp   = (Mat_SeqAIJCUSPARSE*)A->spptr;
2007ccdfe979SStefano Zampini   switch (product->type) {
2008ccdfe979SStefano Zampini   case MATPRODUCT_AB:
2009ccdfe979SStefano Zampini   case MATPRODUCT_PtAP:
2010ccdfe979SStefano Zampini     mat = cusp->mat;
2011ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2012ccdfe979SStefano Zampini     m   = A->rmap->n;
2013ccdfe979SStefano Zampini     n   = B->cmap->n;
2014ccdfe979SStefano Zampini     break;
2015ccdfe979SStefano Zampini   case MATPRODUCT_AtB:
20161a2c6b5cSJunchao Zhang     if (!A->form_explicit_transpose) {
2017e6e9a74fSStefano Zampini       mat = cusp->mat;
2018e6e9a74fSStefano Zampini       opA = CUSPARSE_OPERATION_TRANSPOSE;
2019e6e9a74fSStefano Zampini     } else {
20201a2c6b5cSJunchao Zhang       ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
2021ccdfe979SStefano Zampini       mat  = cusp->matTranspose;
2022ccdfe979SStefano Zampini       opA  = CUSPARSE_OPERATION_NON_TRANSPOSE;
2023e6e9a74fSStefano Zampini     }
2024ccdfe979SStefano Zampini     m = A->cmap->n;
2025ccdfe979SStefano Zampini     n = B->cmap->n;
2026ccdfe979SStefano Zampini     break;
2027ccdfe979SStefano Zampini   case MATPRODUCT_ABt:
2028ccdfe979SStefano Zampini   case MATPRODUCT_RARt:
2029ccdfe979SStefano Zampini     mat = cusp->mat;
2030ccdfe979SStefano Zampini     opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2031ccdfe979SStefano Zampini     m   = A->rmap->n;
2032ccdfe979SStefano Zampini     n   = B->rmap->n;
2033ccdfe979SStefano Zampini     break;
2034ccdfe979SStefano Zampini   default:
2035ccdfe979SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2036ccdfe979SStefano Zampini   }
2037ccdfe979SStefano Zampini   if (!mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct");
2038ccdfe979SStefano Zampini   csrmat = (CsrMatrix*)mat->mat;
2039ccdfe979SStefano Zampini   /* if the user passed a CPU matrix, copy the data to the GPU */
2040ccdfe979SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSECUDA,&biscuda);CHKERRQ(ierr);
2041afb2bd1cSJunchao Zhang   if (!biscuda) {ierr = MatConvert(B,MATSEQDENSECUDA,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);}
2042ccdfe979SStefano Zampini   ierr = MatDenseCUDAGetArrayRead(B,&barray);CHKERRQ(ierr);
2043afb2bd1cSJunchao Zhang 
2044ccdfe979SStefano Zampini   ierr = MatDenseGetLDA(B,&blda);CHKERRQ(ierr);
2045c8378d12SStefano Zampini   if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
2046c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2047c8378d12SStefano Zampini     ierr = MatDenseGetLDA(mmdata->X,&clda);CHKERRQ(ierr);
2048c8378d12SStefano Zampini   } else {
2049c8378d12SStefano Zampini     ierr = MatDenseCUDAGetArrayWrite(C,&carray);CHKERRQ(ierr);
2050c8378d12SStefano Zampini     ierr = MatDenseGetLDA(C,&clda);CHKERRQ(ierr);
2051c8378d12SStefano Zampini   }
2052c8378d12SStefano Zampini 
2053c8378d12SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2054afb2bd1cSJunchao Zhang  #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2055afb2bd1cSJunchao Zhang   cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE;
2056fcdce8c4SStefano Zampini   /* (re)allcoate mmBuffer if not initialized or LDAs are different */
2057afb2bd1cSJunchao Zhang   if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) {
2058fcdce8c4SStefano Zampini     size_t mmBufferSize;
2059afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Blda != blda) {stat = cusparseDestroyDnMat(mmdata->matBDescr);CHKERRCUSPARSE(stat); mmdata->matBDescr = NULL;}
2060afb2bd1cSJunchao Zhang     if (!mmdata->matBDescr) {
2061afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matBDescr,B->rmap->n,B->cmap->n,blda,(void*)barray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2062afb2bd1cSJunchao Zhang       mmdata->Blda = blda;
2063afb2bd1cSJunchao Zhang     }
2064c8378d12SStefano Zampini 
2065afb2bd1cSJunchao Zhang     if (mmdata->initialized && mmdata->Clda != clda) {stat = cusparseDestroyDnMat(mmdata->matCDescr);CHKERRCUSPARSE(stat); mmdata->matCDescr = NULL;}
2066afb2bd1cSJunchao Zhang     if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */
2067afb2bd1cSJunchao Zhang       stat         = cusparseCreateDnMat(&mmdata->matCDescr,m,n,clda,(void*)carray,cusparse_scalartype,CUSPARSE_ORDER_COL);CHKERRCUSPARSE(stat);
2068afb2bd1cSJunchao Zhang       mmdata->Clda = clda;
2069afb2bd1cSJunchao Zhang     }
2070afb2bd1cSJunchao Zhang 
2071afb2bd1cSJunchao Zhang     if (!mat->matDescr) {
2072afb2bd1cSJunchao Zhang       stat = cusparseCreateCsr(&mat->matDescr,
2073afb2bd1cSJunchao Zhang                                csrmat->num_rows, csrmat->num_cols, csrmat->num_entries,
2074afb2bd1cSJunchao Zhang                                csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(),
2075afb2bd1cSJunchao Zhang                                csrmat->values->data().get(),
2076afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_32I,CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */
2077afb2bd1cSJunchao Zhang                                CUSPARSE_INDEX_BASE_ZERO,cusparse_scalartype);CHKERRCUSPARSE(stat);
2078afb2bd1cSJunchao Zhang     }
2079afb2bd1cSJunchao Zhang     stat = cusparseSpMM_bufferSize(cusp->handle,opA,opB,mat->alpha_one,
2080afb2bd1cSJunchao Zhang                                    mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2081afb2bd1cSJunchao Zhang                                    mmdata->matCDescr,cusparse_scalartype,
2082fcdce8c4SStefano Zampini                                    cusp->spmmAlg,&mmBufferSize);CHKERRCUSPARSE(stat);
2083fcdce8c4SStefano Zampini     if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) {
2084fcdce8c4SStefano Zampini       cerr = cudaFree(mmdata->mmBuffer);CHKERRCUDA(cerr);
2085fcdce8c4SStefano Zampini       cerr = cudaMalloc(&mmdata->mmBuffer,mmBufferSize);CHKERRCUDA(cerr);
2086fcdce8c4SStefano Zampini       mmdata->mmBufferSize = mmBufferSize;
2087fcdce8c4SStefano Zampini     }
2088afb2bd1cSJunchao Zhang     mmdata->initialized = PETSC_TRUE;
2089afb2bd1cSJunchao Zhang   } else {
2090afb2bd1cSJunchao Zhang     /* to be safe, always update pointers of the mats */
2091afb2bd1cSJunchao Zhang     stat = cusparseSpMatSetValues(mat->matDescr,csrmat->values->data().get());CHKERRCUSPARSE(stat);
2092afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matBDescr,(void*)barray);CHKERRCUSPARSE(stat);
2093afb2bd1cSJunchao Zhang     stat = cusparseDnMatSetValues(mmdata->matCDescr,(void*)carray);CHKERRCUSPARSE(stat);
2094afb2bd1cSJunchao Zhang   }
2095afb2bd1cSJunchao Zhang 
2096afb2bd1cSJunchao Zhang   /* do cusparseSpMM, which supports transpose on B */
2097afb2bd1cSJunchao Zhang   stat = cusparseSpMM(cusp->handle,opA,opB,mat->alpha_one,
2098afb2bd1cSJunchao Zhang                       mat->matDescr,mmdata->matBDescr,mat->beta_zero,
2099afb2bd1cSJunchao Zhang                       mmdata->matCDescr,cusparse_scalartype,
2100fcdce8c4SStefano Zampini                       cusp->spmmAlg,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2101afb2bd1cSJunchao Zhang  #else
2102afb2bd1cSJunchao Zhang   PetscInt k;
2103afb2bd1cSJunchao Zhang   /* cusparseXcsrmm does not support transpose on B */
2104ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
2105ccdfe979SStefano Zampini     cublasHandle_t cublasv2handle;
2106ccdfe979SStefano Zampini     cublasStatus_t cerr;
2107ccdfe979SStefano Zampini 
2108ccdfe979SStefano Zampini     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
2109ccdfe979SStefano Zampini     cerr = cublasXgeam(cublasv2handle,CUBLAS_OP_T,CUBLAS_OP_T,
2110ccdfe979SStefano Zampini                        B->cmap->n,B->rmap->n,
2111ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ONE ,barray,blda,
2112ccdfe979SStefano Zampini                        &PETSC_CUSPARSE_ZERO,barray,blda,
2113ccdfe979SStefano Zampini                        mmdata->Bt,B->cmap->n);CHKERRCUBLAS(cerr);
2114ccdfe979SStefano Zampini     blda = B->cmap->n;
2115afb2bd1cSJunchao Zhang     k    = B->cmap->n;
2116afb2bd1cSJunchao Zhang   } else {
2117afb2bd1cSJunchao Zhang     k    = B->rmap->n;
2118ccdfe979SStefano Zampini   }
2119ccdfe979SStefano Zampini 
2120afb2bd1cSJunchao Zhang   /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */
2121ccdfe979SStefano Zampini   stat = cusparse_csr_spmm(cusp->handle,opA,m,n,k,
2122afb2bd1cSJunchao Zhang                            csrmat->num_entries,mat->alpha_one,mat->descr,
2123ccdfe979SStefano Zampini                            csrmat->values->data().get(),
2124ccdfe979SStefano Zampini                            csrmat->row_offsets->data().get(),
2125ccdfe979SStefano Zampini                            csrmat->column_indices->data().get(),
2126ccdfe979SStefano Zampini                            mmdata->Bt ? mmdata->Bt : barray,blda,mat->beta_zero,
2127ccdfe979SStefano Zampini                            carray,clda);CHKERRCUSPARSE(stat);
2128afb2bd1cSJunchao Zhang  #endif
2129afb2bd1cSJunchao Zhang   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2130c8378d12SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2131c8378d12SStefano Zampini   ierr = PetscLogGpuFlops(n*2.0*csrmat->num_entries);CHKERRQ(ierr);
2132ccdfe979SStefano Zampini   ierr = MatDenseCUDARestoreArrayRead(B,&barray);CHKERRQ(ierr);
2133ccdfe979SStefano Zampini   if (product->type == MATPRODUCT_RARt) {
2134ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2135ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2136ccdfe979SStefano Zampini   } else if (product->type == MATPRODUCT_PtAP) {
2137ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(mmdata->X,&carray);CHKERRQ(ierr);
2138ccdfe979SStefano Zampini     ierr = MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Private(B,mmdata->X,C,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2139ccdfe979SStefano Zampini   } else {
2140ccdfe979SStefano Zampini     ierr = MatDenseCUDARestoreArrayWrite(C,&carray);CHKERRQ(ierr);
2141ccdfe979SStefano Zampini   }
2142ccdfe979SStefano Zampini   if (mmdata->cisdense) {
2143ccdfe979SStefano Zampini     ierr = MatConvert(C,MATSEQDENSE,MAT_INPLACE_MATRIX,&C);CHKERRQ(ierr);
2144ccdfe979SStefano Zampini   }
2145ccdfe979SStefano Zampini   if (!biscuda) {
2146ccdfe979SStefano Zampini     ierr = MatConvert(B,MATSEQDENSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
2147ccdfe979SStefano Zampini   }
2148ccdfe979SStefano Zampini   PetscFunctionReturn(0);
2149ccdfe979SStefano Zampini }
2150ccdfe979SStefano Zampini 
/*
  MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA - Symbolic phase for products between a
  MATSEQAIJCUSPARSE matrix A and a dense matrix B.

  Input Parameter:
. C - the product matrix; C->product provides A, B and the product type

  Notes:
  Handles the product types AB, AtB, ABt, PtAP and RARt. The routine sets the sizes of C,
  switches C to MATSEQDENSECUDA (recording in the product data whether C was a MATSEQDENSE
  matrix on entry), allocates the MatMatCusparse product data and installs
  MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA() as the numeric routine.

  An error (PETSC_ERR_PLIB) is raised if C already carries product data, if A is not of type
  MATSEQAIJCUSPARSE, if A is not stored in MAT_CUSPARSE_CSR format, or if the product type is
  not one of those listed above.
*/
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C)
{
  Mat_Product        *product = C->product;
  Mat                A,B;
  PetscInt           nrow,ncol;
  PetscBool          isaijcusparse,wascpudense;
  PetscErrorCode     ierr;
  MatMatCusparse     *mmdata;
  Mat_SeqAIJCUSPARSE *cusp;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
  A = product->A;
  B = product->B;

  /* only a CSR-stored MATSEQAIJCUSPARSE left operand is supported */
  ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&isaijcusparse);CHKERRQ(ierr);
  if (!isaijcusparse) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");

  /* local sizes of the result, depending on the requested product */
  if (product->type == MATPRODUCT_AB) {
    nrow = A->rmap->n;
    ncol = B->cmap->n;
  } else if (product->type == MATPRODUCT_AtB) {
    nrow = A->cmap->n;
    ncol = B->cmap->n;
  } else if (product->type == MATPRODUCT_ABt) {
    nrow = A->rmap->n;
    ncol = B->rmap->n;
  } else if (product->type == MATPRODUCT_PtAP) {
    nrow = B->cmap->n;
    ncol = B->cmap->n;
  } else if (product->type == MATPRODUCT_RARt) {
    nrow = B->rmap->n;
    ncol = B->rmap->n;
  } else {
    SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
  }
  ierr = MatSetSizes(C,nrow,ncol,nrow,ncol);CHKERRQ(ierr);

  /* a CPU dense C (MATSEQDENSE) is computed on the GPU and copied back to the CPU afterwards:
     remember the original type before switching C to the CUDA dense type */
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQDENSE,&wascpudense);CHKERRQ(ierr);
  ierr = MatSetType(C,MATSEQDENSECUDA);CHKERRQ(ierr);

  /* allocate the data attached to the product */
  ierr = PetscNew(&mmdata);CHKERRQ(ierr);
  mmdata->cisdense = wascpudense;
 #if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
  /* cusparseXcsrmm cannot transpose B, hence a buffer holding B^T is needed for these products */
  if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) {
    cudaError_t cerr = cudaMalloc((void**)&mmdata->Bt,(size_t)B->rmap->n*(size_t)B->cmap->n*sizeof(PetscScalar));CHKERRCUDA(cerr);
  }
 #endif

  /* RARt and PtAP go through an intermediate dense matrix X with as many rows as A */
  if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) {
    const PetscInt xcol = (product->type == MATPRODUCT_RARt) ? B->rmap->n : B->cmap->n;

    ierr = MatCreate(PetscObjectComm((PetscObject)C),&mmdata->X);CHKERRQ(ierr);
    ierr = MatSetType(mmdata->X,MATSEQDENSECUDA);CHKERRQ(ierr);
    /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */
    ierr = MatSetSizes(mmdata->X,A->rmap->n,xcol,A->rmap->n,xcol);CHKERRQ(ierr);
  }

  /* hand the data over to the product and select the numeric phase */
  C->product->destroy    = MatDestroy_MatMatCusparse;
  C->product->data       = mmdata;
  C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA;
  PetscFunctionReturn(0);
}
2224ccdfe979SStefano Zampini 
/*
  MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE - Numeric phase of the sparse-sparse product
  of two MATSEQAIJCUSPARSE matrices (product types AB, AtB and ABt).

  Input Parameter:
. C - the product matrix (MATSEQAIJCUSPARSE) carrying the MatMatCusparse product data

  Notes:
  When the product data has reusesym set, the values of C were already computed during the
  symbolic phase: the flag is cleared, the GPU structures of C are validated and only the
  assembly bookkeeping is performed. The computation is also skipped when C has no nonzeros.

  Otherwise A and B are copied to the GPU and the product is evaluated with
  cusparseSpGEMM_compute()/cusparseSpGEMM_copy() (CUDA >= 11) or cusparse_csr_spgemm()
  (older CUDA), always with non-transposed operands: for AtB and ABt the explicitly stored
  transpose of A, respectively B, is used instead.
*/
static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product        *product = C->product;
  Mat_SeqAIJ         *c       = (Mat_SeqAIJ*)C->data;
  Mat_SeqAIJCUSPARSE *Ccusp;
  MatMatCusparse     *mmdata;
  PetscBool          istype;
  PetscBool          compute = PETSC_TRUE; /* cleared when the values of C need not be (re)computed */
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  MatCheckProduct(C,1);
  if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
  ierr = PetscObjectTypeCompare((PetscObject)C,MATSEQAIJCUSPARSE,&istype);CHKERRQ(ierr);
  if (!istype) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for C of type %s",((PetscObject)C)->type_name);
  mmdata = (MatMatCusparse*)C->product->data;

  if (mmdata->reusesym) {
    /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    Mat_SeqAIJCUSPARSEMultStruct *Cmult;

    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    Cmult = Ccusp->mat;
    if (!Cmult) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[C->product->type]);
    if (!(CsrMatrix*)Cmult->mat) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");
    compute = PETSC_FALSE;
  } else if (!c->nz) {
    /* empty result: nothing to compute */
    compute = PETSC_FALSE;
  }

  if (compute) {
    Mat                          A = product->A,B = product->B;
    Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp;
    Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
    CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
    MatProductType               ptype;
    cusparseStatus_t             stat;
    cudaError_t                  cerr;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    cusparseSpMatDescr_t         BmatSpDescr;
#endif

    /* both operands must be MATSEQAIJCUSPARSE matrices that are not bound to the CPU */
    ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&istype);CHKERRQ(ierr);
    if (!istype) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
    ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&istype);CHKERRQ(ierr);
    if (!istype) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
    if (A->boundtocpu || B->boundtocpu) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_ARG_WRONG,"Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");

    /* all three matrices must use the CSR storage format */
    Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
    Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
    Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
    if (Acusp->format != MAT_CUSPARSE_CSR || Bcusp->format != MAT_CUSPARSE_CSR || Ccusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
    ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);

    /* a transposed operand flagged as symmetric is handled as a plain AB product */
    ptype = product->type;
    if (ptype == MATPRODUCT_AtB && A->symmetric) ptype = MATPRODUCT_AB;
    if (ptype == MATPRODUCT_ABt && B->symmetric) ptype = MATPRODUCT_AB;
    if (ptype != MATPRODUCT_AB && ptype != MATPRODUCT_AtB && ptype != MATPRODUCT_ABt) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);

    /* pick the stored transpose for the operand that enters the product transposed */
    Amat = (ptype == MATPRODUCT_AtB) ? Acusp->matTranspose : Acusp->mat;
    Bmat = (ptype == MATPRODUCT_ABt) ? Bcusp->matTranspose : Bcusp->mat;
    Cmat = Ccusp->mat;
    if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
    if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
    if (!Cmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C mult struct for product type %s",MatProductTypes[ptype]);

    Acsr = (CsrMatrix*)Amat->mat;
    Bcsr = (CsrMatrix*)Bmat->mat;
    if (mmdata->Bcsr) Bcsr = mmdata->Bcsr; /* B may be in compressed row storage */
    Ccsr = (CsrMatrix*)Cmat->mat;
    if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
    if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
    if (!Ccsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing C CSR struct");

    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    /* B may be in compressed row storage: in that case use the descriptor kept in the product data */
    BmatSpDescr = Bmat->matDescr;
    if (mmdata->Bcsr) BmatSpDescr = mmdata->matSpBDescr;
    stat = cusparseSpGEMM_compute(Ccusp->handle,CUSPARSE_OPERATION_NON_TRANSPOSE,CUSPARSE_OPERATION_NON_TRANSPOSE,
                                  Cmat->alpha_one,Amat->matDescr,BmatSpDescr,Cmat->beta_zero,Cmat->matDescr,
                                  cusparse_scalartype,CUSPARSE_SPGEMM_DEFAULT,
                                  mmdata->spgemmDesc,&mmdata->mmBufferSize,mmdata->mmBuffer);CHKERRCUSPARSE(stat);
    stat = cusparseSpGEMM_copy(Ccusp->handle,CUSPARSE_OPERATION_NON_TRANSPOSE,CUSPARSE_OPERATION_NON_TRANSPOSE,
                               Cmat->alpha_one,Amat->matDescr,BmatSpDescr,Cmat->beta_zero,Cmat->matDescr,
                               cusparse_scalartype,CUSPARSE_SPGEMM_DEFAULT,mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
#else
    stat = cusparse_csr_spgemm(Ccusp->handle,CUSPARSE_OPERATION_NON_TRANSPOSE,CUSPARSE_OPERATION_NON_TRANSPOSE,
                               Acsr->num_rows,Bcsr->num_cols,Acsr->num_cols,
                               Amat->descr,Acsr->num_entries,Acsr->values->data().get(),Acsr->row_offsets->data().get(),Acsr->column_indices->data().get(),
                               Bmat->descr,Bcsr->num_entries,Bcsr->values->data().get(),Bcsr->row_offsets->data().get(),Bcsr->column_indices->data().get(),
                               Cmat->descr,Ccsr->values->data().get(),Ccsr->row_offsets->data().get(),Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
#endif
    ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    /* the up-to-date values of C now live on the GPU */
    C->offloadmask = PETSC_OFFLOAD_GPU;
  }

  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr);
  ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr);
  c->reallocs         = 0;
  C->info.mallocs    += 0;
  C->info.nz_unneeded = 0;
  C->was_assembled    = PETSC_TRUE;
  C->assembled        = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(0);
}
2339fcdce8c4SStefano Zampini 
2340fcdce8c4SStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
2341fcdce8c4SStefano Zampini {
2342fcdce8c4SStefano Zampini   Mat_Product                  *product = C->product;
2343fcdce8c4SStefano Zampini   Mat                          A,B;
2344fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp,*Bcusp,*Ccusp;
2345fcdce8c4SStefano Zampini   Mat_SeqAIJ                   *a,*b,*c;
2346fcdce8c4SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Amat,*Bmat,*Cmat;
2347fcdce8c4SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
2348fcdce8c4SStefano Zampini   PetscInt                     i,j,m,n,k;
2349fcdce8c4SStefano Zampini   PetscBool                    flg;
2350fcdce8c4SStefano Zampini   PetscErrorCode               ierr;
2351fcdce8c4SStefano Zampini   cusparseStatus_t             stat;
2352fcdce8c4SStefano Zampini   cudaError_t                  cerr;
2353fcdce8c4SStefano Zampini   MatProductType               ptype;
2354fcdce8c4SStefano Zampini   MatMatCusparse               *mmdata;
2355fcdce8c4SStefano Zampini   PetscLogDouble               flops;
2356fcdce8c4SStefano Zampini   PetscBool                    biscompressed,ciscompressed;
2357fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2358fcdce8c4SStefano Zampini   int64_t                      C_num_rows1, C_num_cols1, C_nnz1;
2359fcdce8c4SStefano Zampini   size_t                       bufSize2;
2360fcdce8c4SStefano Zampini   cusparseSpMatDescr_t         BmatSpDescr;
2361fcdce8c4SStefano Zampini #else
2362fcdce8c4SStefano Zampini   int                          cnz;
2363fcdce8c4SStefano Zampini #endif
2364fcdce8c4SStefano Zampini 
2365fcdce8c4SStefano Zampini   PetscFunctionBegin;
2366fcdce8c4SStefano Zampini   MatCheckProduct(C,1);
2367fcdce8c4SStefano Zampini   if (C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
2368fcdce8c4SStefano Zampini   A    = product->A;
2369fcdce8c4SStefano Zampini   B    = product->B;
2370fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)A,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2371fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for type %s",((PetscObject)A)->type_name);
2372fcdce8c4SStefano Zampini   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQAIJCUSPARSE,&flg);CHKERRQ(ierr);
2373fcdce8c4SStefano Zampini   if (!flg) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for B of type %s",((PetscObject)B)->type_name);
2374fcdce8c4SStefano Zampini   a = (Mat_SeqAIJ*)A->data;
2375fcdce8c4SStefano Zampini   b = (Mat_SeqAIJ*)B->data;
2376fcdce8c4SStefano Zampini   Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
2377fcdce8c4SStefano Zampini   Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr;
2378fcdce8c4SStefano Zampini   if (Acusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2379fcdce8c4SStefano Zampini   if (Bcusp->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Only for MAT_CUSPARSE_CSR format");
2380fcdce8c4SStefano Zampini 
2381fcdce8c4SStefano Zampini   /* product data */
2382fcdce8c4SStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
2383fcdce8c4SStefano Zampini   C->product->data    = mmdata;
2384fcdce8c4SStefano Zampini   C->product->destroy = MatDestroy_MatMatCusparse;
2385fcdce8c4SStefano Zampini 
2386fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2387fcdce8c4SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
2388fcdce8c4SStefano Zampini   ptype = product->type;
2389fcdce8c4SStefano Zampini   if (A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
2390fcdce8c4SStefano Zampini   if (B->symmetric && ptype == MATPRODUCT_ABt) ptype = MATPRODUCT_AB;
2391fcdce8c4SStefano Zampini   biscompressed = PETSC_FALSE;
2392fcdce8c4SStefano Zampini   ciscompressed = PETSC_FALSE;
2393fcdce8c4SStefano Zampini   switch (ptype) {
2394fcdce8c4SStefano Zampini   case MATPRODUCT_AB:
2395fcdce8c4SStefano Zampini     m = A->rmap->n;
2396fcdce8c4SStefano Zampini     n = B->cmap->n;
2397fcdce8c4SStefano Zampini     k = A->cmap->n;
2398fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2399fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2400fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2401fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2402fcdce8c4SStefano Zampini     break;
2403fcdce8c4SStefano Zampini   case MATPRODUCT_AtB:
2404fcdce8c4SStefano Zampini     m = A->cmap->n;
2405fcdce8c4SStefano Zampini     n = B->cmap->n;
2406fcdce8c4SStefano Zampini     k = A->rmap->n;
24071a2c6b5cSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
2408fcdce8c4SStefano Zampini     Amat = Acusp->matTranspose;
2409fcdce8c4SStefano Zampini     Bmat = Bcusp->mat;
2410fcdce8c4SStefano Zampini     if (b->compressedrow.use) biscompressed = PETSC_TRUE;
2411fcdce8c4SStefano Zampini     break;
2412fcdce8c4SStefano Zampini   case MATPRODUCT_ABt:
2413fcdce8c4SStefano Zampini     m = A->rmap->n;
2414fcdce8c4SStefano Zampini     n = B->rmap->n;
2415fcdce8c4SStefano Zampini     k = A->cmap->n;
24161a2c6b5cSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
2417fcdce8c4SStefano Zampini     Amat = Acusp->mat;
2418fcdce8c4SStefano Zampini     Bmat = Bcusp->matTranspose;
2419fcdce8c4SStefano Zampini     if (a->compressedrow.use) ciscompressed = PETSC_TRUE;
2420fcdce8c4SStefano Zampini     break;
2421fcdce8c4SStefano Zampini   default:
2422fcdce8c4SStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]);
2423fcdce8c4SStefano Zampini   }
2424fcdce8c4SStefano Zampini 
2425fcdce8c4SStefano Zampini   /* create cusparse matrix */
2426fcdce8c4SStefano Zampini   ierr  = MatSetSizes(C,m,n,m,n);CHKERRQ(ierr);
2427fcdce8c4SStefano Zampini   ierr  = MatSetType(C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2428fcdce8c4SStefano Zampini   c     = (Mat_SeqAIJ*)C->data;
2429fcdce8c4SStefano Zampini   Ccusp = (Mat_SeqAIJCUSPARSE*)C->spptr;
2430fcdce8c4SStefano Zampini   Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
2431fcdce8c4SStefano Zampini   Ccsr  = new CsrMatrix;
2432fcdce8c4SStefano Zampini 
2433fcdce8c4SStefano Zampini   c->compressedrow.use = ciscompressed;
2434fcdce8c4SStefano Zampini   if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */
2435fcdce8c4SStefano Zampini     c->compressedrow.nrows = a->compressedrow.nrows;
2436fcdce8c4SStefano Zampini     ierr = PetscMalloc2(c->compressedrow.nrows+1,&c->compressedrow.i,c->compressedrow.nrows,&c->compressedrow.rindex);CHKERRQ(ierr);
2437fcdce8c4SStefano Zampini     ierr = PetscArraycpy(c->compressedrow.rindex,a->compressedrow.rindex,c->compressedrow.nrows);CHKERRQ(ierr);
2438fcdce8c4SStefano Zampini     Ccusp->workVector  = new THRUSTARRAY(c->compressedrow.nrows);
2439fcdce8c4SStefano Zampini     Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows);
2440fcdce8c4SStefano Zampini     Cmat->cprowIndices->assign(c->compressedrow.rindex,c->compressedrow.rindex + c->compressedrow.nrows);
2441fcdce8c4SStefano Zampini   } else {
2442fcdce8c4SStefano Zampini     c->compressedrow.nrows  = 0;
2443fcdce8c4SStefano Zampini     c->compressedrow.i      = NULL;
2444fcdce8c4SStefano Zampini     c->compressedrow.rindex = NULL;
2445fcdce8c4SStefano Zampini     Ccusp->workVector       = NULL;
2446fcdce8c4SStefano Zampini     Cmat->cprowIndices      = NULL;
2447fcdce8c4SStefano Zampini   }
2448fcdce8c4SStefano Zampini   Ccusp->nrows    = ciscompressed ? c->compressedrow.nrows : m;
2449fcdce8c4SStefano Zampini   Ccusp->mat      = Cmat;
2450fcdce8c4SStefano Zampini   Ccusp->mat->mat = Ccsr;
2451fcdce8c4SStefano Zampini   Ccsr->num_rows    = Ccusp->nrows;
2452fcdce8c4SStefano Zampini   Ccsr->num_cols    = n;
2453fcdce8c4SStefano Zampini   Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows+1);
2454fcdce8c4SStefano Zampini   stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
2455fcdce8c4SStefano Zampini   stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
2456fcdce8c4SStefano Zampini   stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
2457fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
2458fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
2459fcdce8c4SStefano Zampini   cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
2460fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2461fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2462fcdce8c4SStefano Zampini   cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
2463fcdce8c4SStefano Zampini   if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! */
2464fcdce8c4SStefano Zampini     thrust::fill(thrust::device,Ccsr->row_offsets->begin(),Ccsr->row_offsets->end(),0);
2465fcdce8c4SStefano Zampini     c->nz = 0;
2466fcdce8c4SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2467fcdce8c4SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
2468fcdce8c4SStefano Zampini     goto finalizesym;
2469fcdce8c4SStefano Zampini   }
2470fcdce8c4SStefano Zampini 
2471fcdce8c4SStefano Zampini   if (!Amat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A mult struct for product type %s",MatProductTypes[ptype]);
2472fcdce8c4SStefano Zampini   if (!Bmat) SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B mult struct for product type %s",MatProductTypes[ptype]);
2473fcdce8c4SStefano Zampini   Acsr = (CsrMatrix*)Amat->mat;
2474fcdce8c4SStefano Zampini   if (!biscompressed) {
2475fcdce8c4SStefano Zampini     Bcsr = (CsrMatrix*)Bmat->mat;
2476fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2477fcdce8c4SStefano Zampini     BmatSpDescr = Bmat->matDescr;
2478fcdce8c4SStefano Zampini #endif
2479fcdce8c4SStefano Zampini   } else { /* we need to use row offsets for the full matrix */
2480fcdce8c4SStefano Zampini     CsrMatrix *cBcsr = (CsrMatrix*)Bmat->mat;
2481fcdce8c4SStefano Zampini     Bcsr = new CsrMatrix;
2482fcdce8c4SStefano Zampini     Bcsr->num_rows       = B->rmap->n;
2483fcdce8c4SStefano Zampini     Bcsr->num_cols       = cBcsr->num_cols;
2484fcdce8c4SStefano Zampini     Bcsr->num_entries    = cBcsr->num_entries;
2485fcdce8c4SStefano Zampini     Bcsr->column_indices = cBcsr->column_indices;
2486fcdce8c4SStefano Zampini     Bcsr->values         = cBcsr->values;
2487fcdce8c4SStefano Zampini     if (!Bcusp->rowoffsets_gpu) {
2488fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
2489fcdce8c4SStefano Zampini       Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
2490fcdce8c4SStefano Zampini       ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
2491fcdce8c4SStefano Zampini     }
2492fcdce8c4SStefano Zampini     Bcsr->row_offsets = Bcusp->rowoffsets_gpu;
2493fcdce8c4SStefano Zampini     mmdata->Bcsr = Bcsr;
2494fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2495fcdce8c4SStefano Zampini     if (Bcsr->num_rows && Bcsr->num_cols) {
2496fcdce8c4SStefano Zampini       stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries,
2497fcdce8c4SStefano Zampini                                Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2498fcdce8c4SStefano Zampini                                Bcsr->values->data().get(),
2499fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2500fcdce8c4SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2501fcdce8c4SStefano Zampini     }
2502fcdce8c4SStefano Zampini     BmatSpDescr = mmdata->matSpBDescr;
2503fcdce8c4SStefano Zampini #endif
2504fcdce8c4SStefano Zampini   }
2505fcdce8c4SStefano Zampini   if (!Acsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing A CSR struct");
2506fcdce8c4SStefano Zampini   if (!Bcsr) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Missing B CSR struct");
2507fcdce8c4SStefano Zampini   /* precompute flops count */
2508fcdce8c4SStefano Zampini   if (ptype == MATPRODUCT_AB) {
2509fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2510fcdce8c4SStefano Zampini       const PetscInt st = a->i[i];
2511fcdce8c4SStefano Zampini       const PetscInt en = a->i[i+1];
2512fcdce8c4SStefano Zampini       for (j=st; j<en; j++) {
2513fcdce8c4SStefano Zampini         const PetscInt brow = a->j[j];
2514fcdce8c4SStefano Zampini         flops += 2.*(b->i[brow+1] - b->i[brow]);
2515fcdce8c4SStefano Zampini       }
2516fcdce8c4SStefano Zampini     }
2517fcdce8c4SStefano Zampini   } else if (ptype == MATPRODUCT_AtB) {
2518fcdce8c4SStefano Zampini     for (i=0, flops = 0; i<A->rmap->n; i++) {
2519fcdce8c4SStefano Zampini       const PetscInt anzi = a->i[i+1] - a->i[i];
2520fcdce8c4SStefano Zampini       const PetscInt bnzi = b->i[i+1] - b->i[i];
2521fcdce8c4SStefano Zampini       flops += (2.*anzi)*bnzi;
2522fcdce8c4SStefano Zampini     }
2523fcdce8c4SStefano Zampini   } else { /* TODO */
2524fcdce8c4SStefano Zampini     flops = 0.;
2525fcdce8c4SStefano Zampini   }
2526fcdce8c4SStefano Zampini 
2527fcdce8c4SStefano Zampini   mmdata->flops = flops;
2528fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2529fcdce8c4SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2530fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2531fcdce8c4SStefano Zampini   stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0,
2532fcdce8c4SStefano Zampini                            NULL, NULL, NULL,
2533fcdce8c4SStefano Zampini                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
2534fcdce8c4SStefano Zampini                            CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
2535fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_createDescr(&mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2536fcdce8c4SStefano Zampini   /* ask bufferSize bytes for external memory */
2537fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2538fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2539fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2540fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, NULL);CHKERRCUSPARSE(stat);
2541bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer2, bufSize2);CHKERRCUDA(cerr);
2542fcdce8c4SStefano Zampini   /* inspect the matrices A and B to understand the memory requirement for the next step */
2543fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_workEstimation(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2544fcdce8c4SStefano Zampini                                        Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2545fcdce8c4SStefano Zampini                                        cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2546fcdce8c4SStefano Zampini                                        mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2);CHKERRCUSPARSE(stat);
2547fcdce8c4SStefano Zampini   /* ask bufferSize again bytes for external memory */
2548fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2549fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2550fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2551fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL);CHKERRCUSPARSE(stat);
2552fcdce8c4SStefano Zampini   /* The CUSPARSE documentation is not clear, nor the API
2553fcdce8c4SStefano Zampini      We need both buffers to perform the operations properly!
2554fcdce8c4SStefano Zampini      mmdata->mmBuffer2 does not appear anywhere in the compute/copy API
2555fcdce8c4SStefano Zampini      it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address
2556fcdce8c4SStefano Zampini      is stored in the descriptor! What a messy API... */
2557bfcc3627SStefano Zampini   cerr = cudaMalloc((void**) &mmdata->mmBuffer, mmdata->mmBufferSize);CHKERRCUDA(cerr);
2558fcdce8c4SStefano Zampini   /* compute the intermediate product of A * B */
2559fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_compute(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2560fcdce8c4SStefano Zampini                                 Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2561fcdce8c4SStefano Zampini                                 cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT,
2562fcdce8c4SStefano Zampini                                 mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);CHKERRCUSPARSE(stat);
2563fcdce8c4SStefano Zampini   /* get matrix C non-zero entries C_nnz1 */
2564fcdce8c4SStefano Zampini   stat = cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1);CHKERRCUSPARSE(stat);
2565fcdce8c4SStefano Zampini   c->nz = (PetscInt) C_nnz1;
256600702c57SStefano Zampini   ierr = PetscInfo9(C,"Buffer sizes for type %s, result %D x %D (k %D, nzA %D, nzB %D, nzC %D) are: %ldKB %ldKB\n",MatProductTypes[ptype],m,n,k,a->nz,b->nz,c->nz,bufSize2/1024,mmdata->mmBufferSize/1024);CHKERRQ(ierr);
2567fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2568fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2569fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2570fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2571fcdce8c4SStefano Zampini   stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(),
2572fcdce8c4SStefano Zampini                                 Ccsr->values->data().get());CHKERRCUSPARSE(stat);
2573fcdce8c4SStefano Zampini   stat = cusparseSpGEMM_copy(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2574fcdce8c4SStefano Zampini                              Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr,
2575fcdce8c4SStefano Zampini                              cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);CHKERRCUSPARSE(stat);
2576fcdce8c4SStefano Zampini #else
2577fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
2578fcdce8c4SStefano Zampini   stat = cusparseXcsrgemmNnz(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2579fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2580fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2581fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2582fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->row_offsets->data().get(), &cnz);CHKERRCUSPARSE(stat);
2583fcdce8c4SStefano Zampini   c->nz = cnz;
2584fcdce8c4SStefano Zampini   Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
2585fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2586fcdce8c4SStefano Zampini   Ccsr->values = new THRUSTARRAY(c->nz);
2587fcdce8c4SStefano Zampini   CHKERRCUDA(cudaPeekAtLastError()); /* catch out of memory errors */
2588fcdce8c4SStefano Zampini 
2589fcdce8c4SStefano Zampini   stat = cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
2590fcdce8c4SStefano Zampini   /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only.
2591fcdce8c4SStefano Zampini      I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when
2592fcdce8c4SStefano Zampini      D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */
2593fcdce8c4SStefano Zampini   stat = cusparse_csr_spgemm(Ccusp->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE,
2594fcdce8c4SStefano Zampini                              Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols,
2595fcdce8c4SStefano Zampini                              Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(),
2596fcdce8c4SStefano Zampini                              Bmat->descr, Bcsr->num_entries, Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(),
2597fcdce8c4SStefano Zampini                              Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());CHKERRCUSPARSE(stat);
2598fcdce8c4SStefano Zampini #endif
2599fcdce8c4SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
2600fcdce8c4SStefano Zampini   ierr = PetscLogGpuFlops(mmdata->flops);CHKERRQ(ierr);
2601fcdce8c4SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2602fcdce8c4SStefano Zampini finalizesym:
2603fcdce8c4SStefano Zampini   c->singlemalloc = PETSC_FALSE;
2604fcdce8c4SStefano Zampini   c->free_a       = PETSC_TRUE;
2605fcdce8c4SStefano Zampini   c->free_ij      = PETSC_TRUE;
2606fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
2607fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
2608fcdce8c4SStefano Zampini   if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
2609fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2610fcdce8c4SStefano Zampini     THRUSTINTARRAY ii(Ccsr->row_offsets->size());
2611fcdce8c4SStefano Zampini     THRUSTINTARRAY jj(Ccsr->column_indices->size());
2612fcdce8c4SStefano Zampini     ii   = *Ccsr->row_offsets;
2613fcdce8c4SStefano Zampini     jj   = *Ccsr->column_indices;
2614fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2615fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2616fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2617fcdce8c4SStefano Zampini   } else {
2618fcdce8c4SStefano Zampini     PetscInt *d_i = c->i;
2619fcdce8c4SStefano Zampini     if (ciscompressed) d_i = c->compressedrow.i;
2620fcdce8c4SStefano Zampini     cerr = cudaMemcpy(d_i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2621fcdce8c4SStefano Zampini     cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
2622fcdce8c4SStefano Zampini   }
2623fcdce8c4SStefano Zampini   if (ciscompressed) { /* need to expand host row offsets */
2624fcdce8c4SStefano Zampini     PetscInt r = 0;
2625fcdce8c4SStefano Zampini     c->i[0] = 0;
2626fcdce8c4SStefano Zampini     for (k = 0; k < c->compressedrow.nrows; k++) {
2627fcdce8c4SStefano Zampini       const PetscInt next = c->compressedrow.rindex[k];
2628fcdce8c4SStefano Zampini       const PetscInt old = c->compressedrow.i[k];
2629fcdce8c4SStefano Zampini       for (; r < next; r++) c->i[r+1] = old;
2630fcdce8c4SStefano Zampini     }
2631fcdce8c4SStefano Zampini     for (; r < m; r++) c->i[r+1] = c->compressedrow.i[c->compressedrow.nrows];
2632fcdce8c4SStefano Zampini   }
2633fcdce8c4SStefano Zampini   ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
2634fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
2635fcdce8c4SStefano Zampini   ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
2636fcdce8c4SStefano Zampini   c->maxnz = c->nz;
2637fcdce8c4SStefano Zampini   c->nonzerorowcnt = 0;
2638fcdce8c4SStefano Zampini   c->rmax = 0;
2639fcdce8c4SStefano Zampini   for (k = 0; k < m; k++) {
2640fcdce8c4SStefano Zampini     const PetscInt nn = c->i[k+1] - c->i[k];
2641fcdce8c4SStefano Zampini     c->ilen[k] = c->imax[k] = nn;
2642fcdce8c4SStefano Zampini     c->nonzerorowcnt += (PetscInt)!!nn;
2643fcdce8c4SStefano Zampini     c->rmax = PetscMax(c->rmax,nn);
2644fcdce8c4SStefano Zampini   }
2645fcdce8c4SStefano Zampini   ierr = MatMarkDiagonal_SeqAIJ(C);CHKERRQ(ierr);
2646fcdce8c4SStefano Zampini   ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
2647fcdce8c4SStefano Zampini   Ccsr->num_entries = c->nz;
2648fcdce8c4SStefano Zampini 
2649fcdce8c4SStefano Zampini   C->nonzerostate++;
2650fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
2651fcdce8c4SStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
2652fcdce8c4SStefano Zampini   Ccusp->nonzerostate = C->nonzerostate;
2653fcdce8c4SStefano Zampini   C->offloadmask   = PETSC_OFFLOAD_UNALLOCATED;
2654fcdce8c4SStefano Zampini   C->preallocated  = PETSC_TRUE;
2655fcdce8c4SStefano Zampini   C->assembled     = PETSC_FALSE;
2656fcdce8c4SStefano Zampini   C->was_assembled = PETSC_FALSE;
2657abb89eb1SStefano Zampini   if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */
2658fcdce8c4SStefano Zampini     mmdata->reusesym = PETSC_TRUE;
2659fcdce8c4SStefano Zampini     C->offloadmask   = PETSC_OFFLOAD_GPU;
2660fcdce8c4SStefano Zampini   }
2661fcdce8c4SStefano Zampini   C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
2662fcdce8c4SStefano Zampini   PetscFunctionReturn(0);
2663fcdce8c4SStefano Zampini }
2664fcdce8c4SStefano Zampini 
2665fcdce8c4SStefano Zampini PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat);
2666fcdce8c4SStefano Zampini 
/* Picks the symbolic-phase routine of a matrix product whose B factor may be sparse or dense */
static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat)
{
  Mat_Product    *product = mat->product;
  PetscErrorCode ierr;
  PetscBool      Bdense = PETSC_FALSE,Bgpu = PETSC_FALSE,Cgpu = PETSC_TRUE;

  PetscFunctionBegin;
  MatCheckProduct(mat,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)product->B,MATSEQDENSE,&Bdense);CHKERRQ(ierr);
  /* B is tested for the cuSPARSE type only when neither A nor B is bound to the CPU */
  if (!(product->A->boundtocpu || product->B->boundtocpu)) {
    ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJCUSPARSE,&Bgpu);CHKERRQ(ierr);
  }
  /* C is inspected only for triple products; a C bound to the CPU never counts as cuSPARSE */
  if (product->type == MATPRODUCT_ABC) {
    Cgpu = PETSC_FALSE;
    if (!product->C->boundtocpu) {
      ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJCUSPARSE,&Cgpu);CHKERRQ(ierr);
    }
  }

  /* B is not dense and the operands are not all cuSPARSE: defer to the AIJ dispatcher */
  if (!Bdense && !(Bgpu && Cgpu)) {
    ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }

  if (Bdense) {
    if (product->type == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    } else if (product->type == MATPRODUCT_AB   || product->type == MATPRODUCT_AtB ||
               product->type == MATPRODUCT_ABt  || product->type == MATPRODUCT_PtAP ||
               product->type == MATPRODUCT_RARt) {
      if (!product->A->boundtocpu) {
        mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA;
      } else { /* A is bound to the CPU: use the SeqAIJ-SeqDense dispatcher */
        ierr = MatProductSetFromOptions_SeqAIJ_SeqDense(mat);CHKERRQ(ierr);
      }
    } /* any other product type: productsymbolic is left untouched */
  } else { /* sparse-sparse with all operands of cuSPARSE type */
    if (product->type == MATPRODUCT_AB || product->type == MATPRODUCT_AtB || product->type == MATPRODUCT_ABt) {
      mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE;
    } else if (product->type == MATPRODUCT_PtAP || product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_ABC) {
      mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
    } /* any other product type: productsymbolic is left untouched */
  }
  PetscFunctionReturn(0);
}
2725ccdfe979SStefano Zampini 
27266fa9248bSJed Brown static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
27279ae82921SPaul Mullowney {
2728b175d8bbSPaul Mullowney   PetscErrorCode ierr;
27299ae82921SPaul Mullowney 
27309ae82921SPaul Mullowney   PetscFunctionBegin;
2731e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_FALSE,PETSC_FALSE);CHKERRQ(ierr);
2732e6e9a74fSStefano Zampini   PetscFunctionReturn(0);
2733e6e9a74fSStefano Zampini }
2734e6e9a74fSStefano Zampini 
/* zz = A xx + yy: forwards to the shared multiply-add kernel without transposition */
static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy, Vec zz)
{
  const PetscBool trans = PETSC_FALSE,herm = PETSC_FALSE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2743e6e9a74fSStefano Zampini 
/* yy = A^H xx: forwards to the shared multiply-add kernel with both the transpose and Hermitian flags set */
static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2752e6e9a74fSStefano Zampini 
/* zz = A^H xx + yy: forwards to the shared multiply-add kernel with both the transpose and Hermitian flags set */
static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  const PetscBool trans = PETSC_TRUE,herm = PETSC_TRUE;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,trans,herm);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
27619ae82921SPaul Mullowney 
27626fa9248bSJed Brown static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2763ca45077fSPaul Mullowney {
2764b175d8bbSPaul Mullowney   PetscErrorCode ierr;
2765ca45077fSPaul Mullowney 
2766ca45077fSPaul Mullowney   PetscFunctionBegin;
2767e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,NULL,yy,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2768ca45077fSPaul Mullowney   PetscFunctionReturn(0);
2769ca45077fSPaul Mullowney }
2770ca45077fSPaul Mullowney 
/*
  ScatterAdd - CUDA kernel performing y[idx[i]] += x[i] for 0 <= i < n, one entry per thread.

  Input Parameters:
+ n   - number of entries to scatter
. idx - destination index in y for each of the n entries
- x   - the n values to add

  Output Parameter:
. y   - array updated in place

  Notes:
  Only the x-dimension of the grid and block is used to form the thread index; threads with index >= n do nothing.
  The update is a plain (non-atomic) read-modify-write, so the entries of idx[] are presumably distinct -- TODO confirm against the callers.
*/
__global__ static void ScatterAdd(PetscInt n, PetscInt *idx,const PetscScalar *x,PetscScalar *y)
{
  /* Form the global thread index in 64 bits. blockIdx.x*blockDim.x is an unsigned 32-bit product that wraps for very
     large grids, and storing it in an int could produce a negative index that still satisfies the bound check below */
  const long long i = (long long)blockIdx.x*blockDim.x + threadIdx.x;
  if (i < (long long)n) y[idx[i]] += x[i];
}
2776a0e72f99SJunchao Zhang 
2777afb2bd1cSJunchao Zhang /* z = op(A) x + y. If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */
2778e6e9a74fSStefano Zampini static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz,PetscBool trans,PetscBool herm)
27799ae82921SPaul Mullowney {
27809ae82921SPaul Mullowney   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data;
2781aa372e3fSPaul Mullowney   Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
27829ff858a8SKarl Rupp   Mat_SeqAIJCUSPARSEMultStruct *matstruct;
2783e6e9a74fSStefano Zampini   PetscScalar                  *xarray,*zarray,*dptr,*beta,*xptr;
2784b175d8bbSPaul Mullowney   PetscErrorCode               ierr;
278557d48284SJunchao Zhang   cudaError_t                  cerr;
2786aa372e3fSPaul Mullowney   cusparseStatus_t             stat;
2787e6e9a74fSStefano Zampini   cusparseOperation_t          opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
2788e6e9a74fSStefano Zampini   PetscBool                    compressed;
2789afb2bd1cSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2790afb2bd1cSJunchao Zhang   PetscInt                     nx,ny;
2791afb2bd1cSJunchao Zhang #endif
27926e111a19SKarl Rupp 
27939ae82921SPaul Mullowney   PetscFunctionBegin;
2794e6e9a74fSStefano Zampini   if (herm && !trans) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Hermitian and not transpose not supported");
2795e6e9a74fSStefano Zampini   if (!a->nonzerorowcnt) {
2796afb2bd1cSJunchao Zhang     if (!yy) {ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);}
2797d38a13f6SStefano Zampini     else {ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr);}
2798e6e9a74fSStefano Zampini     PetscFunctionReturn(0);
2799e6e9a74fSStefano Zampini   }
280034d6c7a5SJose E. Roman   /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */
280134d6c7a5SJose E. Roman   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
2802e6e9a74fSStefano Zampini   if (!trans) {
28039ff858a8SKarl Rupp     matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2804c9567895SMark     if (!matstruct) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"SeqAIJCUSPARSE does not have a 'mat' (need to fix)");
2805e6e9a74fSStefano Zampini   } else {
28061a2c6b5cSJunchao Zhang     if (herm || !A->form_explicit_transpose) {
2807e6e9a74fSStefano Zampini       opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE;
2808e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
2809e6e9a74fSStefano Zampini     } else {
28101a2c6b5cSJunchao Zhang       if (!cusparsestruct->matTranspose) {ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);}
2811e6e9a74fSStefano Zampini       matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->matTranspose;
2812e6e9a74fSStefano Zampini     }
2813e6e9a74fSStefano Zampini   }
2814e6e9a74fSStefano Zampini   /* Does the matrix use compressed rows (i.e., drop zero rows)? */
2815e6e9a74fSStefano Zampini   compressed = matstruct->cprowIndices ? PETSC_TRUE : PETSC_FALSE;
2816213423ffSJunchao Zhang 
2817e6e9a74fSStefano Zampini   try {
2818e6e9a74fSStefano Zampini     ierr = VecCUDAGetArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2819213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDAGetArray(zz,&zarray);CHKERRQ(ierr);} /* read & write zz, so need to get uptodate zarray on GPU */
2820213423ffSJunchao Zhang     else {ierr = VecCUDAGetArrayWrite(zz,&zarray);CHKERRQ(ierr);} /* write zz, so no need to init zarray on GPU */
2821afb2bd1cSJunchao Zhang 
282285ba7357SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2823e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2824afb2bd1cSJunchao Zhang       /* z = A x + beta y.
2825afb2bd1cSJunchao Zhang          If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax.
2826afb2bd1cSJunchao Zhang          When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call.
2827afb2bd1cSJunchao Zhang       */
2828e6e9a74fSStefano Zampini       xptr = xarray;
2829afb2bd1cSJunchao Zhang       dptr = compressed ? cusparsestruct->workVector->data().get() : zarray;
2830213423ffSJunchao Zhang       beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero;
2831afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2832afb2bd1cSJunchao Zhang       /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is
2833afb2bd1cSJunchao Zhang           allocated to accommodate different uses. So we get the length info directly from mat.
2834afb2bd1cSJunchao Zhang        */
2835afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2836afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2837afb2bd1cSJunchao Zhang         nx = mat->num_cols;
2838afb2bd1cSJunchao Zhang         ny = mat->num_rows;
2839afb2bd1cSJunchao Zhang       }
2840afb2bd1cSJunchao Zhang      #endif
2841e6e9a74fSStefano Zampini     } else {
2842afb2bd1cSJunchao Zhang       /* z = A^T x + beta y
2843afb2bd1cSJunchao Zhang          If A is compressed, then we need a work vector as the shorter version of x to compute A^T x.
2844afb2bd1cSJunchao Zhang          Note A^Tx is of full length, so we set beta to 1.0 if y exists.
2845afb2bd1cSJunchao Zhang        */
2846afb2bd1cSJunchao Zhang       xptr = compressed ? cusparsestruct->workVector->data().get() : xarray;
2847e6e9a74fSStefano Zampini       dptr = zarray;
2848e6e9a74fSStefano Zampini       beta = yy ? matstruct->beta_one : matstruct->beta_zero;
2849afb2bd1cSJunchao Zhang       if (compressed) { /* Scatter x to work vector */
2850e6e9a74fSStefano Zampini         thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray);
2851a0e72f99SJunchao Zhang         thrust::for_each(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))),
2852e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2853e6e9a74fSStefano Zampini                          VecCUDAEqualsReverse());
2854e6e9a74fSStefano Zampini       }
2855afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2856afb2bd1cSJunchao Zhang       if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2857afb2bd1cSJunchao Zhang         CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2858afb2bd1cSJunchao Zhang         nx = mat->num_rows;
2859afb2bd1cSJunchao Zhang         ny = mat->num_cols;
2860afb2bd1cSJunchao Zhang       }
2861afb2bd1cSJunchao Zhang      #endif
2862e6e9a74fSStefano Zampini     }
28639ae82921SPaul Mullowney 
2864afb2bd1cSJunchao Zhang     /* csr_spmv does y = alpha op(A) x + beta y */
2865aa372e3fSPaul Mullowney     if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
2866afb2bd1cSJunchao Zhang      #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2867afb2bd1cSJunchao Zhang       if (opA < 0 || opA > 2) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly");
2868afb2bd1cSJunchao Zhang       if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */
2869afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr,nx,xptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2870afb2bd1cSJunchao Zhang         stat = cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr,ny,dptr,cusparse_scalartype);CHKERRCUSPARSE(stat);
2871afb2bd1cSJunchao Zhang         stat = cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one,
2872afb2bd1cSJunchao Zhang                                 matstruct->matDescr,
2873afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecXDescr, beta,
2874afb2bd1cSJunchao Zhang                                 matstruct->cuSpMV[opA].vecYDescr,
2875afb2bd1cSJunchao Zhang                                 cusparse_scalartype,
2876afb2bd1cSJunchao Zhang                                 cusparsestruct->spmvAlg,
2877afb2bd1cSJunchao Zhang                                 &matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUSPARSE(stat);
2878afb2bd1cSJunchao Zhang         cerr = cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer,matstruct->cuSpMV[opA].spmvBufferSize);CHKERRCUDA(cerr);
2879afb2bd1cSJunchao Zhang 
2880afb2bd1cSJunchao Zhang         matstruct->cuSpMV[opA].initialized = PETSC_TRUE;
2881afb2bd1cSJunchao Zhang       } else {
2882afb2bd1cSJunchao Zhang         /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */
2883afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr,xptr);CHKERRCUSPARSE(stat);
2884afb2bd1cSJunchao Zhang         stat = cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr,dptr);CHKERRCUSPARSE(stat);
2885afb2bd1cSJunchao Zhang       }
2886afb2bd1cSJunchao Zhang 
2887afb2bd1cSJunchao Zhang       stat = cusparseSpMV(cusparsestruct->handle, opA,
2888afb2bd1cSJunchao Zhang                                matstruct->alpha_one,
28891a2c6b5cSJunchao Zhang                                matstruct->matDescr, /* built in MatSeqAIJCUSPARSECopyToGPU() or MatSeqAIJCUSPARSEFormExplicitTransposeForMult() */
2890afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecXDescr,
2891afb2bd1cSJunchao Zhang                                beta,
2892afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].vecYDescr,
2893afb2bd1cSJunchao Zhang                                cusparse_scalartype,
2894afb2bd1cSJunchao Zhang                                cusparsestruct->spmvAlg,
2895afb2bd1cSJunchao Zhang                                matstruct->cuSpMV[opA].spmvBuffer);CHKERRCUSPARSE(stat);
2896afb2bd1cSJunchao Zhang      #else
28977656d835SStefano Zampini       CsrMatrix *mat = (CsrMatrix*)matstruct->mat;
2898e6e9a74fSStefano Zampini       stat = cusparse_csr_spmv(cusparsestruct->handle, opA,
2899a65300a6SPaul Mullowney                                mat->num_rows, mat->num_cols,
2900afb2bd1cSJunchao Zhang                                mat->num_entries, matstruct->alpha_one, matstruct->descr,
2901aa372e3fSPaul Mullowney                                mat->values->data().get(), mat->row_offsets->data().get(),
2902e6e9a74fSStefano Zampini                                mat->column_indices->data().get(), xptr, beta,
290357d48284SJunchao Zhang                                dptr);CHKERRCUSPARSE(stat);
2904afb2bd1cSJunchao Zhang      #endif
2905aa372e3fSPaul Mullowney     } else {
2906213423ffSJunchao Zhang       if (cusparsestruct->nrows) {
2907afb2bd1cSJunchao Zhang        #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
2908afb2bd1cSJunchao Zhang         SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
2909afb2bd1cSJunchao Zhang        #else
2910301298b4SMark Adams         cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat;
2911e6e9a74fSStefano Zampini         stat = cusparse_hyb_spmv(cusparsestruct->handle, opA,
2912afb2bd1cSJunchao Zhang                                  matstruct->alpha_one, matstruct->descr, hybMat,
2913e6e9a74fSStefano Zampini                                  xptr, beta,
291457d48284SJunchao Zhang                                  dptr);CHKERRCUSPARSE(stat);
2915afb2bd1cSJunchao Zhang        #endif
2916a65300a6SPaul Mullowney       }
2917aa372e3fSPaul Mullowney     }
291805035670SJunchao Zhang     cerr = WaitForCUDA();CHKERRCUDA(cerr);
2919958c4211Shannah_mairs     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2920aa372e3fSPaul Mullowney 
2921e6e9a74fSStefano Zampini     if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) {
2922213423ffSJunchao Zhang       if (yy) { /* MatMultAdd: zz = A*xx + yy */
2923213423ffSJunchao Zhang         if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */
2924213423ffSJunchao Zhang           ierr = VecCopy_SeqCUDA(yy,zz);CHKERRQ(ierr); /* zz = yy */
2925e6e9a74fSStefano Zampini         } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */
2926213423ffSJunchao Zhang           ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
29277656d835SStefano Zampini         }
2928213423ffSJunchao Zhang       } else if (compressed) { /* MatMult: zz = A*xx. A is compressed, so we zero zz first, then ScatterAdd the work vector to zz */
2929c1fb3f03SStefano Zampini         ierr = VecSet_SeqCUDA(zz,0);CHKERRQ(ierr);
29307656d835SStefano Zampini       }
29317656d835SStefano Zampini 
2932213423ffSJunchao Zhang       /* ScatterAdd the result from work vector into the full vector when A is compressed */
2933213423ffSJunchao Zhang       if (compressed) {
2934e6e9a74fSStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
2935a0e72f99SJunchao Zhang         /* I wanted to make this for_each asynchronous but failed. thrust::async::for_each() returns an event (internally registerred)
2936a0e72f99SJunchao Zhang            and in the destructor of the scope, it will call cudaStreamSynchronize() on this stream. One has to store all events to
2937a0e72f99SJunchao Zhang            prevent that. So I just add a ScatterAdd kernel.
2938a0e72f99SJunchao Zhang          */
2939a0e72f99SJunchao Zhang        #if 0
2940a0e72f99SJunchao Zhang         thrust::device_ptr<PetscScalar> zptr = thrust::device_pointer_cast(zarray);
2941a0e72f99SJunchao Zhang         thrust::async::for_each(thrust::cuda::par.on(cusparsestruct->stream),
2942a0e72f99SJunchao Zhang                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))),
2943e6e9a74fSStefano Zampini                          thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(zptr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(),
2944c41cb2e2SAlejandro Lamas Daviña                          VecCUDAPlusEquals());
2945a0e72f99SJunchao Zhang        #else
2946a0e72f99SJunchao Zhang         PetscInt n = matstruct->cprowIndices->size();
2947a0e72f99SJunchao Zhang         ScatterAdd<<<(n+255)/256,256,0,PetscDefaultCudaStream>>>(n,matstruct->cprowIndices->data().get(),cusparsestruct->workVector->data().get(),zarray);
2948a0e72f99SJunchao Zhang        #endif
294905035670SJunchao Zhang         cerr = WaitForCUDA();CHKERRCUDA(cerr);
2950958c4211Shannah_mairs         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
2951e6e9a74fSStefano Zampini       }
2952e6e9a74fSStefano Zampini     } else {
2953e6e9a74fSStefano Zampini       if (yy && yy != zz) {
2954e6e9a74fSStefano Zampini         ierr = VecAXPY_SeqCUDA(zz,1.0,yy);CHKERRQ(ierr); /* zz += yy */
2955e6e9a74fSStefano Zampini       }
2956e6e9a74fSStefano Zampini     }
2957e6e9a74fSStefano Zampini     ierr = VecCUDARestoreArrayRead(xx,(const PetscScalar**)&xarray);CHKERRQ(ierr);
2958213423ffSJunchao Zhang     if (yy == zz) {ierr = VecCUDARestoreArray(zz,&zarray);CHKERRQ(ierr);}
2959213423ffSJunchao Zhang     else {ierr = VecCUDARestoreArrayWrite(zz,&zarray);CHKERRQ(ierr);}
29609ae82921SPaul Mullowney   } catch(char *ex) {
29619ae82921SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"CUSPARSE error: %s", ex);
29629ae82921SPaul Mullowney   }
2963e6e9a74fSStefano Zampini   if (yy) {
2964958c4211Shannah_mairs     ierr = PetscLogGpuFlops(2.0*a->nz);CHKERRQ(ierr);
2965e6e9a74fSStefano Zampini   } else {
2966e6e9a74fSStefano Zampini     ierr = PetscLogGpuFlops(2.0*a->nz-a->nonzerorowcnt);CHKERRQ(ierr);
2967e6e9a74fSStefano Zampini   }
29689ae82921SPaul Mullowney   PetscFunctionReturn(0);
29699ae82921SPaul Mullowney }
29709ae82921SPaul Mullowney 
29716fa9248bSJed Brown static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
2972ca45077fSPaul Mullowney {
2973b175d8bbSPaul Mullowney   PetscErrorCode ierr;
29746e111a19SKarl Rupp 
2975ca45077fSPaul Mullowney   PetscFunctionBegin;
2976e6e9a74fSStefano Zampini   ierr = MatMultAddKernel_SeqAIJCUSPARSE(A,xx,yy,zz,PETSC_TRUE,PETSC_FALSE);CHKERRQ(ierr);
2977ca45077fSPaul Mullowney   PetscFunctionReturn(0);
2978ca45077fSPaul Mullowney }
2979ca45077fSPaul Mullowney 
29806fa9248bSJed Brown static PetscErrorCode MatAssemblyEnd_SeqAIJCUSPARSE(Mat A,MatAssemblyType mode)
29819ae82921SPaul Mullowney {
29829ae82921SPaul Mullowney   PetscErrorCode              ierr;
2983a587d139SMark   PetscSplitCSRDataStructure  *d_mat = NULL;
29849ae82921SPaul Mullowney   PetscFunctionBegin;
2985bc3f50f2SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
29863fa6b06aSMark Adams     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
2987bc3f50f2SPaul Mullowney   }
29883fa6b06aSMark Adams   ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); // this does very little if assembled on GPU - call it?
29893fa6b06aSMark Adams   if (mode == MAT_FLUSH_ASSEMBLY || A->boundtocpu) PetscFunctionReturn(0);
2990a587d139SMark   if (d_mat) {
29913fa6b06aSMark Adams     A->offloadmask = PETSC_OFFLOAD_GPU;
29923fa6b06aSMark Adams   }
29933fa6b06aSMark Adams 
29949ae82921SPaul Mullowney   PetscFunctionReturn(0);
29959ae82921SPaul Mullowney }
29969ae82921SPaul Mullowney 
29979ae82921SPaul Mullowney /* --------------------------------------------------------------------------------*/
2998e057df02SPaul Mullowney /*@
29999ae82921SPaul Mullowney    MatCreateSeqAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
3000e057df02SPaul Mullowney    (the default parallel PETSc format). This matrix will ultimately pushed down
3001e057df02SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
3002e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
3003e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
3004e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
30059ae82921SPaul Mullowney 
3006d083f849SBarry Smith    Collective
30079ae82921SPaul Mullowney 
30089ae82921SPaul Mullowney    Input Parameters:
30099ae82921SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
30109ae82921SPaul Mullowney .  m - number of rows
30119ae82921SPaul Mullowney .  n - number of columns
30129ae82921SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
30139ae82921SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
30140298fd71SBarry Smith          (possibly different for each row) or NULL
30159ae82921SPaul Mullowney 
30169ae82921SPaul Mullowney    Output Parameter:
30179ae82921SPaul Mullowney .  A - the matrix
30189ae82921SPaul Mullowney 
30199ae82921SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
30209ae82921SPaul Mullowney    MatXXXXSetPreallocation() paradgm instead of this routine directly.
30219ae82921SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
30229ae82921SPaul Mullowney 
30239ae82921SPaul Mullowney    Notes:
30249ae82921SPaul Mullowney    If nnz is given then nz is ignored
30259ae82921SPaul Mullowney 
30269ae82921SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
30279ae82921SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
30289ae82921SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
30299ae82921SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
30309ae82921SPaul Mullowney 
30319ae82921SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
30320298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
30339ae82921SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
30349ae82921SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
30359ae82921SPaul Mullowney 
30369ae82921SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
30379ae82921SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
30389ae82921SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
30399ae82921SPaul Mullowney    reusing matrix information to achieve increased efficiency.
30409ae82921SPaul Mullowney 
30419ae82921SPaul Mullowney    Level: intermediate
30429ae82921SPaul Mullowney 
3043e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATSEQAIJCUSPARSE, MATAIJCUSPARSE
30449ae82921SPaul Mullowney @*/
30459ae82921SPaul Mullowney PetscErrorCode  MatCreateSeqAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
30469ae82921SPaul Mullowney {
30479ae82921SPaul Mullowney   PetscErrorCode ierr;
30489ae82921SPaul Mullowney 
30499ae82921SPaul Mullowney   PetscFunctionBegin;
30509ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
30519ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
30529ae82921SPaul Mullowney   ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
30539ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
30549ae82921SPaul Mullowney   PetscFunctionReturn(0);
30559ae82921SPaul Mullowney }
30569ae82921SPaul Mullowney 
30576fa9248bSJed Brown static PetscErrorCode MatDestroy_SeqAIJCUSPARSE(Mat A)
30589ae82921SPaul Mullowney {
30599ae82921SPaul Mullowney   PetscErrorCode              ierr;
30603fa6b06aSMark Adams   PetscSplitCSRDataStructure  *d_mat = NULL;
3061ab25e6cbSDominic Meiser 
30629ae82921SPaul Mullowney   PetscFunctionBegin;
30639ae82921SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
30643fa6b06aSMark Adams     d_mat = ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat;
30653fa6b06aSMark Adams     ((Mat_SeqAIJCUSPARSE*)A->spptr)->deviceMat = NULL;
3066470880abSPatrick Sanan     ierr = MatSeqAIJCUSPARSE_Destroy((Mat_SeqAIJCUSPARSE**)&A->spptr);CHKERRQ(ierr);
30679ae82921SPaul Mullowney   } else {
3068470880abSPatrick Sanan     ierr = MatSeqAIJCUSPARSETriFactors_Destroy((Mat_SeqAIJCUSPARSETriFactors**)&A->spptr);CHKERRQ(ierr);
3069aa372e3fSPaul Mullowney   }
30703fa6b06aSMark Adams   if (d_mat) {
30713fa6b06aSMark Adams     Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
30723fa6b06aSMark Adams     cudaError_t                err;
30733fa6b06aSMark Adams     PetscSplitCSRDataStructure h_mat;
30743fa6b06aSMark Adams     ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
30753fa6b06aSMark Adams     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
30763fa6b06aSMark Adams     if (a->compressedrow.use) {
30773fa6b06aSMark Adams       err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
30783fa6b06aSMark Adams     }
30793fa6b06aSMark Adams     err = cudaFree(d_mat);CHKERRCUDA(err);
30803fa6b06aSMark Adams   }
3081c215019aSStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
3082ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
3083ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
3084ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
3085fcdce8c4SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
3086ccdfe979SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr);
30877e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
30887e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
30899ae82921SPaul Mullowney   ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
30909ae82921SPaul Mullowney   PetscFunctionReturn(0);
30919ae82921SPaul Mullowney }
30929ae82921SPaul Mullowney 
3093ccdfe979SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat,MatType,MatReuse,Mat*);
309495639643SRichard Tran Mills static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat,PetscBool);
30959ff858a8SKarl Rupp static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A,MatDuplicateOption cpvalues,Mat *B)
30969ff858a8SKarl Rupp {
30979ff858a8SKarl Rupp   PetscErrorCode ierr;
30989ff858a8SKarl Rupp 
30999ff858a8SKarl Rupp   PetscFunctionBegin;
31009ff858a8SKarl Rupp   ierr = MatDuplicate_SeqAIJ(A,cpvalues,B);CHKERRQ(ierr);
3101ccdfe979SStefano Zampini   ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(*B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,B);CHKERRQ(ierr);
31029ff858a8SKarl Rupp   PetscFunctionReturn(0);
31039ff858a8SKarl Rupp }
31049ff858a8SKarl Rupp 
3105039c6fbaSStefano Zampini static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat Y,PetscScalar a,Mat X,MatStructure str)
310695639643SRichard Tran Mills {
3107e6e9a74fSStefano Zampini   PetscErrorCode     ierr;
3108a587d139SMark   Mat_SeqAIJ         *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data;
3109039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cy;
3110039c6fbaSStefano Zampini   Mat_SeqAIJCUSPARSE *cx;
3111039c6fbaSStefano Zampini   PetscScalar        *ay;
3112039c6fbaSStefano Zampini   const PetscScalar  *ax;
3113039c6fbaSStefano Zampini   CsrMatrix          *csry,*csrx;
3114039c6fbaSStefano Zampini   cudaError_t        cerr;
3115e6e9a74fSStefano Zampini 
311695639643SRichard Tran Mills   PetscFunctionBegin;
3117a49f1ed0SStefano Zampini   cy = (Mat_SeqAIJCUSPARSE*)Y->spptr;
3118a49f1ed0SStefano Zampini   cx = (Mat_SeqAIJCUSPARSE*)X->spptr;
3119039c6fbaSStefano Zampini   if (X->ops->axpy != Y->ops->axpy) {
3120a49f1ed0SStefano Zampini     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
3121a587d139SMark     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
3122a587d139SMark     PetscFunctionReturn(0);
312395639643SRichard Tran Mills   }
3124039c6fbaSStefano Zampini   /* if we are here, it means both matrices are bound to GPU */
3125a587d139SMark   ierr = MatSeqAIJCUSPARSECopyToGPU(Y);CHKERRQ(ierr);
3126a587d139SMark   ierr = MatSeqAIJCUSPARSECopyToGPU(X);CHKERRQ(ierr);
3127039c6fbaSStefano Zampini   if (cy->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)Y),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
3128039c6fbaSStefano Zampini   if (cx->format != MAT_CUSPARSE_CSR) SETERRQ(PetscObjectComm((PetscObject)X),PETSC_ERR_PLIB,"only MAT_CUSPARSE_CSR supported");
3129039c6fbaSStefano Zampini   csry = (CsrMatrix*)cy->mat->mat;
3130039c6fbaSStefano Zampini   csrx = (CsrMatrix*)cx->mat->mat;
3131039c6fbaSStefano Zampini   /* see if we can turn this into a cublas axpy */
3132039c6fbaSStefano Zampini   if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) {
3133039c6fbaSStefano Zampini     bool eq = thrust::equal(thrust::device,csry->row_offsets->begin(),csry->row_offsets->end(),csrx->row_offsets->begin());
3134039c6fbaSStefano Zampini     if (eq) {
3135039c6fbaSStefano Zampini       eq = thrust::equal(thrust::device,csry->column_indices->begin(),csry->column_indices->end(),csrx->column_indices->begin());
3136039c6fbaSStefano Zampini     }
3137039c6fbaSStefano Zampini     if (eq) str = SAME_NONZERO_PATTERN;
3138039c6fbaSStefano Zampini   }
3139d2be01edSStefano Zampini   /* spgeam is buggy with one column */
3140d2be01edSStefano Zampini   if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN;
3141039c6fbaSStefano Zampini 
3142039c6fbaSStefano Zampini   if (str == SUBSET_NONZERO_PATTERN) {
3143039c6fbaSStefano Zampini     cusparseStatus_t stat;
3144039c6fbaSStefano Zampini     PetscScalar      b = 1.0;
3145039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3146039c6fbaSStefano Zampini     size_t           bufferSize;
3147039c6fbaSStefano Zampini     void             *buffer;
3148039c6fbaSStefano Zampini #endif
3149039c6fbaSStefano Zampini 
3150039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
3151039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
3152039c6fbaSStefano Zampini     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST);CHKERRCUSPARSE(stat);
3153039c6fbaSStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
3154039c6fbaSStefano Zampini     stat = cusparse_csr_spgeam_bufferSize(cy->handle,Y->rmap->n,Y->cmap->n,
3155039c6fbaSStefano Zampini                                           &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3156039c6fbaSStefano Zampini                                           &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3157039c6fbaSStefano Zampini                                              cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),&bufferSize);CHKERRCUSPARSE(stat);
3158039c6fbaSStefano Zampini     cerr = cudaMalloc(&buffer,bufferSize);CHKERRCUDA(cerr);
3159039c6fbaSStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3160039c6fbaSStefano Zampini     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3161039c6fbaSStefano Zampini                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3162039c6fbaSStefano Zampini                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3163039c6fbaSStefano Zampini                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),buffer);CHKERRCUSPARSE(stat);
3164039c6fbaSStefano Zampini     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3165039c6fbaSStefano Zampini     ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
3166039c6fbaSStefano Zampini     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3167039c6fbaSStefano Zampini     cerr = cudaFree(buffer);CHKERRCUDA(cerr);
3168039c6fbaSStefano Zampini #else
3169039c6fbaSStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3170039c6fbaSStefano Zampini     stat = cusparse_csr_spgeam(cy->handle,Y->rmap->n,Y->cmap->n,
3171039c6fbaSStefano Zampini                                &a,cx->mat->descr,x->nz,ax,csrx->row_offsets->data().get(),csrx->column_indices->data().get(),
3172039c6fbaSStefano Zampini                                &b,cy->mat->descr,y->nz,ay,csry->row_offsets->data().get(),csry->column_indices->data().get(),
3173039c6fbaSStefano Zampini                                   cy->mat->descr,      ay,csry->row_offsets->data().get(),csry->column_indices->data().get());CHKERRCUSPARSE(stat);
3174039c6fbaSStefano Zampini     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3175039c6fbaSStefano Zampini     ierr = PetscLogGpuFlops(x->nz + y->nz);CHKERRQ(ierr);
3176039c6fbaSStefano Zampini     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3177039c6fbaSStefano Zampini #endif
3178039c6fbaSStefano Zampini     stat = cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE);CHKERRCUSPARSE(stat);
3179039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
3180039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
3181039c6fbaSStefano Zampini     ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
3182039c6fbaSStefano Zampini   } else if (str == SAME_NONZERO_PATTERN) {
3183a587d139SMark     cublasHandle_t cublasv2handle;
3184039c6fbaSStefano Zampini     cublasStatus_t berr;
3185a587d139SMark     PetscBLASInt   one = 1, bnz = 1;
3186039c6fbaSStefano Zampini 
3187039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSEGetArrayRead(X,&ax);CHKERRQ(ierr);
3188039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
3189a587d139SMark     ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
3190a587d139SMark     ierr = PetscBLASIntCast(x->nz,&bnz);CHKERRQ(ierr);
3191a587d139SMark     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
3192039c6fbaSStefano Zampini     berr = cublasXaxpy(cublasv2handle,bnz,&a,ax,one,ay,one);CHKERRCUBLAS(berr);
3193039c6fbaSStefano Zampini     cerr = WaitForCUDA();CHKERRCUDA(cerr);
3194a587d139SMark     ierr = PetscLogGpuFlops(2.0*bnz);CHKERRQ(ierr);
3195a587d139SMark     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
3196039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSERestoreArrayRead(X,&ax);CHKERRQ(ierr);
3197039c6fbaSStefano Zampini     ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
3198a587d139SMark     ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
3199039c6fbaSStefano Zampini   } else {
3200a49f1ed0SStefano Zampini     ierr = MatSeqAIJCUSPARSEInvalidateTranspose(Y,PETSC_FALSE);CHKERRQ(ierr);
3201d2be01edSStefano Zampini     ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr);
3202a587d139SMark   }
320395639643SRichard Tran Mills   PetscFunctionReturn(0);
320495639643SRichard Tran Mills }
320595639643SRichard Tran Mills 
320633c9ba73SStefano Zampini static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat Y,PetscScalar a)
320733c9ba73SStefano Zampini {
320833c9ba73SStefano Zampini   PetscErrorCode ierr;
320933c9ba73SStefano Zampini   Mat_SeqAIJ     *y = (Mat_SeqAIJ*)Y->data;
321033c9ba73SStefano Zampini   PetscScalar    *ay;
321133c9ba73SStefano Zampini   cudaError_t    cerr;
321233c9ba73SStefano Zampini   cublasHandle_t cublasv2handle;
321333c9ba73SStefano Zampini   cublasStatus_t berr;
321433c9ba73SStefano Zampini   PetscBLASInt   one = 1, bnz = 1;
321533c9ba73SStefano Zampini 
321633c9ba73SStefano Zampini   PetscFunctionBegin;
321733c9ba73SStefano Zampini   ierr = MatSeqAIJCUSPARSEGetArray(Y,&ay);CHKERRQ(ierr);
321833c9ba73SStefano Zampini   ierr = PetscCUBLASGetHandle(&cublasv2handle);CHKERRQ(ierr);
321933c9ba73SStefano Zampini   ierr = PetscBLASIntCast(y->nz,&bnz);CHKERRQ(ierr);
322033c9ba73SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
322133c9ba73SStefano Zampini   berr = cublasXscal(cublasv2handle,bnz,&a,ay,one);CHKERRCUBLAS(berr);
322233c9ba73SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
322333c9ba73SStefano Zampini   ierr = PetscLogGpuFlops(bnz);CHKERRQ(ierr);
322433c9ba73SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
322533c9ba73SStefano Zampini   ierr = MatSeqAIJCUSPARSERestoreArray(Y,&ay);CHKERRQ(ierr);
322633c9ba73SStefano Zampini   ierr = MatSeqAIJInvalidateDiagonal(Y);CHKERRQ(ierr);
322733c9ba73SStefano Zampini   PetscFunctionReturn(0);
322833c9ba73SStefano Zampini }
322933c9ba73SStefano Zampini 
32303fa6b06aSMark Adams static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
32313fa6b06aSMark Adams {
32323fa6b06aSMark Adams   PetscErrorCode             ierr;
32337e8381f9SStefano Zampini   PetscBool                  both = PETSC_FALSE;
3234a587d139SMark   Mat_SeqAIJ                 *a = (Mat_SeqAIJ*)A->data;
32357e8381f9SStefano Zampini 
32363fa6b06aSMark Adams   PetscFunctionBegin;
32373fa6b06aSMark Adams   if (A->factortype == MAT_FACTOR_NONE) {
32383fa6b06aSMark Adams     Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE*)A->spptr;
32397e8381f9SStefano Zampini     if (spptr->mat) {
32407e8381f9SStefano Zampini       CsrMatrix* matrix = (CsrMatrix*)spptr->mat->mat;
32417e8381f9SStefano Zampini       if (matrix->values) {
32427e8381f9SStefano Zampini         both = PETSC_TRUE;
32437e8381f9SStefano Zampini         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
32447e8381f9SStefano Zampini       }
32457e8381f9SStefano Zampini     }
32467e8381f9SStefano Zampini     if (spptr->matTranspose) {
32477e8381f9SStefano Zampini       CsrMatrix* matrix = (CsrMatrix*)spptr->matTranspose->mat;
32487e8381f9SStefano Zampini       if (matrix->values) {
32497e8381f9SStefano Zampini         thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
32507e8381f9SStefano Zampini       }
32517e8381f9SStefano Zampini     }
32523fa6b06aSMark Adams   }
3253a587d139SMark   //ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr);
3254a587d139SMark   ierr = PetscArrayzero(a->a,a->i[A->rmap->n]);CHKERRQ(ierr);
3255a587d139SMark   ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr);
32567e8381f9SStefano Zampini   if (both) A->offloadmask = PETSC_OFFLOAD_BOTH;
3257a587d139SMark   else A->offloadmask = PETSC_OFFLOAD_CPU;
32583fa6b06aSMark Adams 
32593fa6b06aSMark Adams   PetscFunctionReturn(0);
32603fa6b06aSMark Adams }
32613fa6b06aSMark Adams 
3262a587d139SMark static PetscErrorCode MatBindToCPU_SeqAIJCUSPARSE(Mat A,PetscBool flg)
3263a587d139SMark {
3264a587d139SMark   Mat_SeqAIJ     *a = (Mat_SeqAIJ*)A->data;
3265a587d139SMark   PetscErrorCode ierr;
3266a587d139SMark 
3267a587d139SMark   PetscFunctionBegin;
3268a587d139SMark   if (A->factortype != MAT_FACTOR_NONE) PetscFunctionReturn(0);
3269a587d139SMark   if (flg) {
3270a587d139SMark     ierr = MatSeqAIJCUSPARSECopyFromGPU(A);CHKERRQ(ierr);
3271a587d139SMark 
327233c9ba73SStefano Zampini     A->ops->scale                     = MatScale_SeqAIJ;
3273a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJ;
3274a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJ;
3275a587d139SMark     A->ops->mult                      = MatMult_SeqAIJ;
3276a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJ;
3277a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJ;
3278a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJ;
3279a587d139SMark     A->ops->multhermitiantranspose    = NULL;
3280a587d139SMark     A->ops->multhermitiantransposeadd = NULL;
3281fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJ;
3282c215019aSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",NULL);CHKERRQ(ierr);
3283a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",NULL);CHKERRQ(ierr);
3284a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",NULL);CHKERRQ(ierr);
3285a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
3286a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
3287a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJ);CHKERRQ(ierr);
3288fcdce8c4SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",NULL);CHKERRQ(ierr);
3289a587d139SMark   } else {
329033c9ba73SStefano Zampini     A->ops->scale                     = MatScale_SeqAIJCUSPARSE;
3291a587d139SMark     A->ops->axpy                      = MatAXPY_SeqAIJCUSPARSE;
3292a587d139SMark     A->ops->zeroentries               = MatZeroEntries_SeqAIJCUSPARSE;
3293a587d139SMark     A->ops->mult                      = MatMult_SeqAIJCUSPARSE;
3294a587d139SMark     A->ops->multadd                   = MatMultAdd_SeqAIJCUSPARSE;
3295a587d139SMark     A->ops->multtranspose             = MatMultTranspose_SeqAIJCUSPARSE;
3296a587d139SMark     A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJCUSPARSE;
3297a587d139SMark     A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJCUSPARSE;
3298a587d139SMark     A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE;
3299fcdce8c4SStefano Zampini     A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJCUSPARSE;
3300c215019aSStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJCopySubArray_C",MatSeqAIJCopySubArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
3301a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3302a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqdense_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3303a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
3304a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJCUSPARSE);CHKERRQ(ierr);
3305a587d139SMark     ierr = PetscObjectComposeFunction((PetscObject)A,"MatSeqAIJGetArray_C",MatSeqAIJGetArray_SeqAIJCUSPARSE);CHKERRQ(ierr);
3306fcdce8c4SStefano Zampini     ierr = PetscObjectComposeFunction((PetscObject)A,"MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C",MatProductSetFromOptions_SeqAIJCUSPARSE);CHKERRQ(ierr);
3307a587d139SMark   }
3308a587d139SMark   A->boundtocpu = flg;
3309a587d139SMark   a->inode.use = flg;
3310a587d139SMark   PetscFunctionReturn(0);
3311a587d139SMark }
3312a587d139SMark 
/*
  MatConvert_SeqAIJ_SeqAIJCUSPARSE - Turns a SeqAIJ matrix into a MATSEQAIJCUSPARSE one.

  Input Parameters:
+ A      - the matrix to convert
. mtype  - requested type (not inspected here)
- reuse  - MAT_INITIAL_MATRIX duplicates A (with values) into *newmat, MAT_REUSE_MATRIX copies A
           into the existing *newmat, any other value leaves *newmat untouched and converts it as is

  Output Parameter:
. newmat - the converted matrix

  Notes:
  The cuSPARSE side structure hanging off B->spptr is only created when it is missing and
  reuse is not MAT_REUSE_MATRIX; its kind depends on whether B is a factored matrix.
*/
PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat              B;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  /* the first use of CUSPARSE may happen through MatConvert, so make sure CUDA is up */
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  switch (reuse) {
  case MAT_INITIAL_MATRIX:
    ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
    break;
  case MAT_REUSE_MATRIX:
    ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
    break;
  default: /* nothing to copy: *newmat is converted as given */
    break;
  }
  B = *newmat;

  /* vectors created from this matrix default to the CUDA vector type */
  ierr = PetscFree(B->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&B->defaultvectype);CHKERRQ(ierr);

  if (reuse != MAT_REUSE_MATRIX && !B->spptr) {
    if (B->factortype != MAT_FACTOR_NONE) {
      /* factored matrix: attach the triangular-factors container */
      Mat_SeqAIJCUSPARSETriFactors *factors;

      ierr = PetscNew(&factors);CHKERRQ(ierr);
      stat = cusparseCreate(&factors->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(factors->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      B->spptr = factors;
    } else {
      /* regular matrix: attach the mult container with CSR defaults */
      Mat_SeqAIJCUSPARSE *cusp;

      ierr = PetscNew(&cusp);CHKERRQ(ierr);
      stat = cusparseCreate(&cusp->handle);CHKERRCUSPARSE(stat);
      stat = cusparseSetStream(cusp->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
      cusp->format     = MAT_CUSPARSE_CSR;
     #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      cusp->spmvAlg    = CUSPARSE_CSRMV_ALG1;    /* default, since we only support csr */
      cusp->spmmAlg    = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */
      cusp->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1;
     #endif
      B->spptr = cusp;
    }
    B->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
  }

  /* operations that are the same whether or not the matrix is bound to the CPU */
  B->ops->duplicate      = MatDuplicate_SeqAIJCUSPARSE;
  B->ops->bindtocpu      = MatBindToCPU_SeqAIJCUSPARSE;
  B->ops->setfromoptions = MatSetFromOptions_SeqAIJCUSPARSE;
  B->ops->setoption      = MatSetOption_SeqAIJCUSPARSE;
  B->ops->destroy        = MatDestroy_SeqAIJCUSPARSE;
  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJCUSPARSE;

  /* binding with PETSC_FALSE installs the remaining CUSPARSE operations */
  ierr = MatBindToCPU_SeqAIJCUSPARSE(B,PETSC_FALSE);CHKERRQ(ierr);
  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)B,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_SeqAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
33669ae82921SPaul Mullowney 
/*
  MatCreate_SeqAIJCUSPARSE - Constructor for MATSEQAIJCUSPARSE: builds B as a SeqAIJ matrix
  and then converts it in place.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* step 1: plain host SeqAIJ setup */
  ierr = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
  /* step 2: in-place conversion, B is both input and output */
  ierr = MatConvert_SeqAIJ_SeqAIJCUSPARSE(B,MATSEQAIJCUSPARSE,MAT_INPLACE_MATRIX,&B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
337602fe1965SBarry Smith 
33773ca39a21SBarry Smith /*MC
3378e057df02SPaul Mullowney    MATSEQAIJCUSPARSE - MATAIJCUSPARSE = "(seq)aijcusparse" - A matrix type to be used for sparse matrices.
3379e057df02SPaul Mullowney 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
33812692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
33822692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3383e057df02SPaul Mullowney 
3384e057df02SPaul Mullowney    Options Database Keys:
3385e057df02SPaul Mullowney +  -mat_type aijcusparse - sets the matrix type to "seqaijcusparse" during a call to MatSetFromOptions()
3386aa372e3fSPaul Mullowney .  -mat_cusparse_storage_format csr - sets the storage format of matrices (for MatMult and factors in MatSolve) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3387a2b725a8SWilliam Gropp -  -mat_cusparse_mult_storage_format csr - sets the storage format of matrices (for MatMult) during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3388e057df02SPaul Mullowney 
3389e057df02SPaul Mullowney   Level: beginner
3390e057df02SPaul Mullowney 
33918468deeeSKarl Rupp .seealso: MatCreateSeqAIJCUSPARSE(), MATAIJCUSPARSE, MatCreateAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3392e057df02SPaul Mullowney M*/
33937f756511SDominic Meiser 
3394bddcd29dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat,MatFactorType,Mat*);
33950f39cd5aSBarry Smith 
/*
  MatSolverTypeRegister_CUSPARSE - Registers the factorization routines of this file:
  the band LU solver for MATSEQAIJ, then LU, Cholesky, ILU and ICC for MATSEQAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  /* factor types served by MatGetFactor_seqaijcusparse_cusparse, in registration order */
  const MatFactorType ftypes[] = {MAT_FACTOR_LU,MAT_FACTOR_CHOLESKY,MAT_FACTOR_ILU,MAT_FACTOR_ICC};
  const PetscInt      nftypes  = (PetscInt)(sizeof(ftypes)/sizeof(ftypes[0]));
  PetscErrorCode      ierr;
  PetscInt            k;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERCUSPARSEBAND, MATSEQAIJ, MAT_FACTOR_LU,MatGetFactor_seqaijcusparse_cusparse_band);CHKERRQ(ierr);
  for (k=0; k<nftypes; k++) {
    ierr = MatSolverTypeRegister(MATSOLVERCUSPARSE,MATSEQAIJCUSPARSE,ftypes[k],MatGetFactor_seqaijcusparse_cusparse);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
340929b38603SBarry Smith 
/*
  MatSeqAIJCUSPARSE_Destroy - Releases everything owned by a Mat_SeqAIJCUSPARSE container
  (mult structures, work arrays, COO permutations, cuSPARSE handle) and then the container itself.
  A NULL *cusparsestruct is accepted and ignored.
*/
static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat_SeqAIJCUSPARSE **cusparsestruct)
{
  Mat_SeqAIJCUSPARSE *cusp = *cusparsestruct;
  cusparseStatus_t   stat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (!cusp) PetscFunctionReturn(0);

  /* matrix and (possibly absent) explicit transpose share the same storage format */
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->mat,cusp->format);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose,cusp->format);CHKERRQ(ierr);

  /* device arrays; delete on a NULL pointer is a no-op */
  delete cusp->workVector;
  delete cusp->rowoffsets_gpu;
  delete cusp->cooPerm;
  delete cusp->cooPerm_a;
  delete cusp->csr2csc_i;

  if (cusp->handle) {
    stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);
  }
  ierr = PetscFree(*cusparsestruct);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
34297f756511SDominic Meiser 
/*
  CsrMatrix_Destroy - Deletes the three arrays of a CsrMatrix and the CsrMatrix itself,
  then sets *mat to NULL. Does nothing when *mat is already NULL.
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) PetscFunctionReturn(0);
  delete csr->row_offsets;
  delete csr->column_indices;
  delete csr->values;
  delete csr;
  *mat = NULL;
  PetscFunctionReturn(0);
}
34427f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy (triangular-factor overload) - Frees the cuSPARSE
  descriptor, analysis info, CSR storage and device/host buffers of one triangular factor,
  then the structure itself. A NULL *trifactor is accepted and ignored.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **trifactor)
{
  Mat_SeqAIJCUSPARSETriFactorStruct *tf = *trifactor;
  cusparseStatus_t                  stat;
  cudaError_t                       cerr;
  PetscErrorCode                    ierr;

  PetscFunctionBegin;
  if (!tf) PetscFunctionReturn(0);

  if (tf->descr) {
    stat = cusparseDestroyMatDescr(tf->descr);CHKERRCUSPARSE(stat);
  }
  if (tf->solveInfo) {
    stat = cusparse_destroy_analysis_info(tf->solveInfo);CHKERRCUSPARSE(stat);
  }
  ierr = CsrMatrix_Destroy(&tf->csrMat);CHKERRQ(ierr);
  if (tf->solveBuffer) {
    cerr = cudaFree(tf->solveBuffer);CHKERRCUDA(cerr);
  }
  if (tf->AA_h) {
    /* AA_h is released with cudaFreeHost, unlike the device buffers around it */
    cerr = cudaFreeHost(tf->AA_h);CHKERRCUDA(cerr);
  }
 #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  if (tf->csr2cscBuffer) {
    cerr = cudaFree(tf->csr2cscBuffer);CHKERRCUDA(cerr);
  }
 #endif
  ierr = PetscFree(*trifactor);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
34627f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSEMultStruct_Destroy (mult-structure overload) - Frees the matrix storage,
  cuSPARSE descriptors, compressed-row indices, device scalars and (CUDA >= 11) SpMV buffers
  of a Mat_SeqAIJCUSPARSEMultStruct, deletes the structure and sets *matstruct to NULL.

  Input Parameters:
+ matstruct - address of the structure pointer; a NULL *matstruct is accepted and ignored
- format    - storage format of (*matstruct)->mat, selects how that storage is released

  Notes:
  With CUDA >= 11 the ELL and HYB formats raise PETSC_ERR_SUP.
*/
static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **matstruct,MatCUSPARSEStorageFormat format)
{
  CsrMatrix        *mat;
  cusparseStatus_t stat;
  cudaError_t      err;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (*matstruct) {
    if ((*matstruct)->mat) {
      if (format==MAT_CUSPARSE_ELL || format==MAT_CUSPARSE_HYB) {
       #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
       #else
        cusparseHybMat_t hybMat = (cusparseHybMat_t)(*matstruct)->mat;
        stat = cusparseDestroyHybMat(hybMat);CHKERRCUSPARSE(stat);
       #endif
      } else {
        mat  = (CsrMatrix*)(*matstruct)->mat;
        /* propagate failures instead of dropping the return code */
        ierr = CsrMatrix_Destroy(&mat);CHKERRQ(ierr);
      }
      /* storage is gone: do not leave a dangling pointer behind if a later step errors out */
      (*matstruct)->mat = NULL;
    }
    if ((*matstruct)->descr) { stat = cusparseDestroyMatDescr((*matstruct)->descr);CHKERRCUSPARSE(stat); }
    delete (*matstruct)->cprowIndices;
    if ((*matstruct)->alpha_one) { err=cudaFree((*matstruct)->alpha_one);CHKERRCUDA(err); }
    if ((*matstruct)->beta_zero) { err=cudaFree((*matstruct)->beta_zero);CHKERRCUDA(err); }
    if ((*matstruct)->beta_one)  { err=cudaFree((*matstruct)->beta_one);CHKERRCUDA(err); }

   #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct;
    if (mdata->matDescr) {stat = cusparseDestroySpMat(mdata->matDescr);CHKERRCUSPARSE(stat);}
    /* one cached SpMV setup per slot of cuSPARSE's cuSpMV[3]; only initialized slots own resources */
    for (int i=0; i<3; i++) {
      if (mdata->cuSpMV[i].initialized) {
        err  = cudaFree(mdata->cuSpMV[i].spmvBuffer);CHKERRCUDA(err);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr);CHKERRCUSPARSE(stat);
        stat = cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr);CHKERRCUSPARSE(stat);
      }
    }
   #endif
    delete *matstruct;
    *matstruct = NULL;
  }
  PetscFunctionReturn(0);
}
35067f756511SDominic Meiser 
/*
  MatSeqAIJCUSPARSETriFactors_Reset - Releases the factor data held by a
  Mat_SeqAIJCUSPARSETriFactors container (the four triangular factor structures, the
  permutation and work arrays, and the band-solver device arrays) while keeping the
  container and its cuSPARSE handle alive.

  Every released member is set to NULL afterwards, so calling this routine again, or calling
  MatSeqAIJCUSPARSETriFactors_Destroy() later, does not free the same memory twice.
  A NULL *trifactors is accepted and ignored.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (*trifactors) {
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtr);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->loTriFactorPtrTranspose);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&(*trifactors)->upTriFactorPtrTranspose);CHKERRQ(ierr);
    delete (*trifactors)->rpermIndices;
    delete (*trifactors)->cpermIndices;
    delete (*trifactors)->workVector;
    (*trifactors)->rpermIndices = NULL;
    (*trifactors)->cpermIndices = NULL;
    (*trifactors)->workVector = NULL;
    if ((*trifactors)->a_band_d) {
      cudaError_t cerr = cudaFree((*trifactors)->a_band_d);CHKERRCUDA(cerr);
      (*trifactors)->a_band_d = NULL; /* the container outlives the reset: avoid a double free */
    }
    if ((*trifactors)->i_band_d) {
      cudaError_t cerr = cudaFree((*trifactors)->i_band_d);CHKERRCUDA(cerr);
      (*trifactors)->i_band_d = NULL;
    }
  }
  PetscFunctionReturn(0);
}
3528ccdfe979SStefano Zampini 
/*
  MatSeqAIJCUSPARSETriFactors_Destroy - Resets the triangular-factors container, destroys its
  cuSPARSE handle (if any) and frees the container. A NULL *trifactors is accepted and ignored.
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors** trifactors)
{
  cusparseHandle_t handle;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (!*trifactors) PetscFunctionReturn(0);

  /* drop the factor data first; the handle is still attached after the reset */
  ierr = MatSeqAIJCUSPARSETriFactors_Reset(trifactors);CHKERRQ(ierr);
  handle = (*trifactors)->handle;
  if (handle) {
    stat = cusparseDestroy(handle);CHKERRCUSPARSE(stat);
  }
  ierr = PetscFree(*trifactors);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
35457e8381f9SStefano Zampini 
/* Strict "less than" for (i,j) pairs: ordered by the first entry, ties broken by the second */
struct IJCompare
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    const PetscInt i1 = t1.get<0>(), i2 = t2.get<0>();

    if (i1 != i2) return i1 < i2;
    return t1.get<1>() < t2.get<1>();
  }
};
35567e8381f9SStefano Zampini 
/* Equality for (i,j) pairs: true only when both entries match */
struct IJEqual
{
  __host__ __device__
  inline bool operator() (const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2)
  {
    return t1.get<0>() == t2.get<0>() && t1.get<1>() == t2.get<1>();
  }
};
35667e8381f9SStefano Zampini 
/* Change indicator: 1 when the two indices differ, 0 when they are equal */
struct IJDiff
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    if (t1 != t2) return 1;
    return 0;
  }
};
35757e8381f9SStefano Zampini 
/* Logical OR of two flags, returned as a PetscInt: 1 when either argument is nonzero, else 0 */
struct IJSum
{
  __host__ __device__
  inline PetscInt operator() (const PetscInt &t1, const PetscInt &t2)
  {
    return (t1 != 0 || t2 != 0) ? 1 : 0;
  }
};
35847e8381f9SStefano Zampini 
35857e8381f9SStefano Zampini #include <thrust/iterator/discard_iterator.h>
/*
  MatSetValuesCOO_SeqAIJCUSPARSE - Sets the values of a matrix whose nonzero pattern was
  given in COO form, writing directly into the CSR values stored on the GPU.

  Input Parameters:
+ A     - the matrix
. v     - values in the same order as the COO indices; may be host or device memory
          (checked with isCudaMem()), or NULL
- imode - INSERT_VALUES or ADD_VALUES (ADD_VALUES adds to the existing entries)

  Notes:
  With v == NULL and INSERT_VALUES the stored values are zeroed; with v == NULL and any other
  mode the values are left alone. If no COO permutation is stored, the matrix is simply assembled.

  cusp->cooPerm maps sorted positions back to the caller's ordering; a non-NULL cusp->cooPerm_a
  means the COO input had repeated entries, which are summed with reduce_by_key.

  Temporary device arrays are scoped objects, so they are released on every exit path,
  including the early returns taken by the error-checking macros.
*/
PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
  CsrMatrix          *matrix;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE struct");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUSPARSE CsrMatrix");
  if (!cusp->cooPerm) {
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    PetscFunctionReturn(0);
  }
  matrix = (CsrMatrix*)cusp->mat->mat;
  if (!matrix || !matrix->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  if (!v) {
    if (imode == INSERT_VALUES) thrust::fill(thrust::device,matrix->values->begin(),matrix->values->end(),0.);
  } else {
    const PetscInt                        n = cusp->cooPerm->size();
    THRUSTARRAY                           cooPerm_v; /* stays empty when v already lives on the device */
    thrust::device_ptr<const PetscScalar> d_v;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* host input: stage a device copy */
      cooPerm_v.assign(v,v+n);
      d_v = cooPerm_v.data();
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
    }
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    if (imode == ADD_VALUES) { /* ADD VALUES means add to existing ones */
      if (cusp->cooPerm_a) {
        /* reduce repeated entries into a scratch array, then add it to the stored values */
        THRUSTARRAY cooPerm_w(matrix->values->size());
        auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),cooPerm_w.begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
        thrust::transform(cooPerm_w.begin(),cooPerm_w.end(),matrix->values->begin(),matrix->values->begin(),thrust::plus<PetscScalar>());
      } else {
        auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                  matrix->values->begin()));
        auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                  matrix->values->end()));
        thrust::for_each(zibit,zieit,VecCUDAPlusEquals());
      }
    } else {
      if (cusp->cooPerm_a) { /* repeated entries in COO, with INSERT_VALUES -> reduce */
        auto vbit = thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin());
        thrust::reduce_by_key(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),vbit,thrust::make_discard_iterator(),matrix->values->begin(),thrust::equal_to<PetscInt>(),thrust::plus<PetscScalar>());
      } else {
        auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->begin()),
                                                                  matrix->values->begin()));
        auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->cooPerm->end()),
                                                                  matrix->values->end()));
        thrust::for_each(zibit,zieit,VecCUDAEquals());
      }
    }
    /* wait for the device work before closing the timer and releasing the staged copy */
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  }
  /* the up-to-date values now live on the GPU */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  ierr = PetscInfo3(A,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",A->rmap->n,A->cmap->n,a->nz);CHKERRQ(ierr);
  ierr = PetscInfo(A,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr);
  ierr = PetscInfo1(A,"Maximum nonzeros in any row is %D\n",a->rmax);CHKERRQ(ierr);
  a->reallocs         = 0;
  A->info.mallocs    += 0;
  A->info.nz_unneeded = 0;
  A->assembled = A->was_assembled = PETSC_TRUE;
  A->num_ass++;
  PetscFunctionReturn(0);
}
36647e8381f9SStefano Zampini 
/*
  MatSeqAIJCUSPARSEInvalidateTranspose - Marks the cached explicit transpose of A as out of date.

  Input Parameters:
+ A       - the matrix, must be of type MATSEQAIJCUSPARSE
- destroy - when PETSC_TRUE, the transpose structure and the csr2csc index array are also freed

  Notes:
  Nothing is done when A has no cuSPARSE side structure attached.
*/
PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusparsestruct) {
    if (destroy) {
      ierr = MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->matTranspose,cusparsestruct->format);CHKERRQ(ierr);
      delete cusparsestruct->csr2csc_i;
      cusparsestruct->csr2csc_i = NULL;
    }
    A->transupdated = PETSC_FALSE;
  }
  PetscFunctionReturn(0);
}
3681a49f1ed0SStefano Zampini 
36827e8381f9SStefano Zampini #include <thrust/binary_search.h>
3683e61fc153SStefano Zampini PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
36847e8381f9SStefano Zampini {
36857e8381f9SStefano Zampini   PetscErrorCode     ierr;
36867e8381f9SStefano Zampini   Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
36877e8381f9SStefano Zampini   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data;
36887e8381f9SStefano Zampini   PetscInt           cooPerm_n, nzr = 0;
36897e8381f9SStefano Zampini   cudaError_t        cerr;
36907e8381f9SStefano Zampini 
36917e8381f9SStefano Zampini   PetscFunctionBegin;
36927e8381f9SStefano Zampini   ierr = PetscLayoutSetUp(A->rmap);CHKERRQ(ierr);
36937e8381f9SStefano Zampini   ierr = PetscLayoutSetUp(A->cmap);CHKERRQ(ierr);
36947e8381f9SStefano Zampini   cooPerm_n = cusp->cooPerm ? cusp->cooPerm->size() : 0;
36957e8381f9SStefano Zampini   if (n != cooPerm_n) {
36967e8381f9SStefano Zampini     delete cusp->cooPerm;
36977e8381f9SStefano Zampini     delete cusp->cooPerm_a;
36987e8381f9SStefano Zampini     cusp->cooPerm = NULL;
36997e8381f9SStefano Zampini     cusp->cooPerm_a = NULL;
37007e8381f9SStefano Zampini   }
37017e8381f9SStefano Zampini   if (n) {
37027e8381f9SStefano Zampini     THRUSTINTARRAY d_i(n);
37037e8381f9SStefano Zampini     THRUSTINTARRAY d_j(n);
37047e8381f9SStefano Zampini     THRUSTINTARRAY ii(A->rmap->n);
37057e8381f9SStefano Zampini 
37067e8381f9SStefano Zampini     if (!cusp->cooPerm)   { cusp->cooPerm   = new THRUSTINTARRAY(n); }
37077e8381f9SStefano Zampini     if (!cusp->cooPerm_a) { cusp->cooPerm_a = new THRUSTINTARRAY(n); }
37087e8381f9SStefano Zampini 
37097e8381f9SStefano Zampini     ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
37107e8381f9SStefano Zampini     d_i.assign(coo_i,coo_i+n);
37117e8381f9SStefano Zampini     d_j.assign(coo_j,coo_j+n);
37127e8381f9SStefano Zampini     auto fkey = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin()));
37137e8381f9SStefano Zampini     auto ekey = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end()));
37147e8381f9SStefano Zampini 
371508391a17SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
37167e8381f9SStefano Zampini     thrust::sequence(thrust::device, cusp->cooPerm->begin(), cusp->cooPerm->end(), 0);
37177e8381f9SStefano Zampini     thrust::sort_by_key(fkey, ekey, cusp->cooPerm->begin(), IJCompare());
37187e8381f9SStefano Zampini     *cusp->cooPerm_a = d_i;
37197e8381f9SStefano Zampini     THRUSTINTARRAY w = d_j;
37207e8381f9SStefano Zampini 
37217e8381f9SStefano Zampini     auto nekey = thrust::unique(fkey, ekey, IJEqual());
37227e8381f9SStefano Zampini     if (nekey == ekey) { /* all entries are unique */
37237e8381f9SStefano Zampini       delete cusp->cooPerm_a;
37247e8381f9SStefano Zampini       cusp->cooPerm_a = NULL;
37257e8381f9SStefano Zampini     } else { /* I couldn't come up with a more elegant algorithm */
37267e8381f9SStefano Zampini       adjacent_difference(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),IJDiff());
37277e8381f9SStefano Zampini       adjacent_difference(w.begin(),w.end(),w.begin(),IJDiff());
37287e8381f9SStefano Zampini       (*cusp->cooPerm_a)[0] = 0;
37297e8381f9SStefano Zampini       w[0] = 0;
37307e8381f9SStefano Zampini       thrust::transform(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),w.begin(),cusp->cooPerm_a->begin(),IJSum());
37317e8381f9SStefano Zampini       thrust::inclusive_scan(cusp->cooPerm_a->begin(),cusp->cooPerm_a->end(),cusp->cooPerm_a->begin(),thrust::plus<PetscInt>());
37327e8381f9SStefano Zampini     }
37337e8381f9SStefano Zampini     thrust::counting_iterator<PetscInt> search_begin(0);
37347e8381f9SStefano Zampini     thrust::upper_bound(d_i.begin(), nekey.get_iterator_tuple().get<0>(),
37357e8381f9SStefano Zampini                         search_begin, search_begin + A->rmap->n,
37367e8381f9SStefano Zampini                         ii.begin());
373708391a17SStefano Zampini     cerr = WaitForCUDA();CHKERRCUDA(cerr);
373808391a17SStefano Zampini     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
37397e8381f9SStefano Zampini 
37407e8381f9SStefano Zampini     ierr = MatSeqXAIJFreeAIJ(A,&a->a,&a->j,&a->i);CHKERRQ(ierr);
37417e8381f9SStefano Zampini     a->singlemalloc = PETSC_FALSE;
37427e8381f9SStefano Zampini     a->free_a       = PETSC_TRUE;
37437e8381f9SStefano Zampini     a->free_ij      = PETSC_TRUE;
37447e8381f9SStefano Zampini     ierr = PetscMalloc1(A->rmap->n+1,&a->i);CHKERRQ(ierr);
37457e8381f9SStefano Zampini     a->i[0] = 0;
37467e8381f9SStefano Zampini     cerr = cudaMemcpy(a->i+1,ii.data().get(),A->rmap->n*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
37477e8381f9SStefano Zampini     a->nz = a->maxnz = a->i[A->rmap->n];
3748fcdce8c4SStefano Zampini     a->rmax = 0;
37497e8381f9SStefano Zampini     ierr = PetscMalloc1(a->nz,&a->a);CHKERRQ(ierr);
37507e8381f9SStefano Zampini     ierr = PetscMalloc1(a->nz,&a->j);CHKERRQ(ierr);
37517e8381f9SStefano Zampini     cerr = cudaMemcpy(a->j,d_j.data().get(),a->nz*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
37527e8381f9SStefano Zampini     if (!a->ilen) { ierr = PetscMalloc1(A->rmap->n,&a->ilen);CHKERRQ(ierr); }
37537e8381f9SStefano Zampini     if (!a->imax) { ierr = PetscMalloc1(A->rmap->n,&a->imax);CHKERRQ(ierr); }
37547e8381f9SStefano Zampini     for (PetscInt i = 0; i < A->rmap->n; i++) {
37557e8381f9SStefano Zampini       const PetscInt nnzr = a->i[i+1] - a->i[i];
37567e8381f9SStefano Zampini       nzr += (PetscInt)!!(nnzr);
37577e8381f9SStefano Zampini       a->ilen[i] = a->imax[i] = nnzr;
3758fcdce8c4SStefano Zampini       a->rmax = PetscMax(a->rmax,nnzr);
37597e8381f9SStefano Zampini     }
3760fcdce8c4SStefano Zampini     a->nonzerorowcnt = nzr;
37617e8381f9SStefano Zampini     A->preallocated = PETSC_TRUE;
37627e8381f9SStefano Zampini     ierr = PetscLogGpuToCpu((A->rmap->n+a->nz)*sizeof(PetscInt));CHKERRQ(ierr);
3763fcdce8c4SStefano Zampini     ierr = MatMarkDiagonal_SeqAIJ(A);CHKERRQ(ierr);
37647e8381f9SStefano Zampini   } else {
37657e8381f9SStefano Zampini     ierr = MatSeqAIJSetPreallocation(A,0,NULL);CHKERRQ(ierr);
37667e8381f9SStefano Zampini   }
3767e61fc153SStefano Zampini   ierr = MatSetOption(A,MAT_NEW_NONZERO_ALLOCATION_ERR,PETSC_TRUE);CHKERRQ(ierr);
37687e8381f9SStefano Zampini 
37697e8381f9SStefano Zampini   /* We want to allocate the CUSPARSE struct for matvec now.
3770e61fc153SStefano Zampini      The code is so convoluted now that I prefer to copy zeros */
3771e61fc153SStefano Zampini   ierr = PetscArrayzero(a->a,a->nz);CHKERRQ(ierr);
37727e8381f9SStefano Zampini   ierr = MatCheckCompressedRow(A,nzr,&a->compressedrow,a->i,A->rmap->n,0.6);CHKERRQ(ierr);
37737e8381f9SStefano Zampini   A->offloadmask = PETSC_OFFLOAD_CPU;
37747e8381f9SStefano Zampini   A->nonzerostate++;
37757e8381f9SStefano Zampini   ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3776a49f1ed0SStefano Zampini   ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
37777e8381f9SStefano Zampini 
37787e8381f9SStefano Zampini   A->assembled = PETSC_FALSE;
37797e8381f9SStefano Zampini   A->was_assembled = PETSC_FALSE;
37807e8381f9SStefano Zampini   PetscFunctionReturn(0);
37817e8381f9SStefano Zampini }
3782ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSEGetArrayRead - Gives read-only access to the GPU copy of the matrix values.

  Input Parameter:
. A - the matrix; must be of type MATSEQAIJCUSPARSE

  Output Parameter:
. a - set to the raw pointer behind the values array of the CSR structure stored in A->spptr

  Notes:
  Only the CSR storage format is handled; MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB raise PETSC_ERR_SUP.
  MatSeqAIJCUSPARSECopyToGPU() is called before the pointer is handed out.
  The offload mask and the object state of A are left untouched.
  Pair every call with MatSeqAIJCUSPARSERestoreArrayRead().
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* A is dereferenced only now, after the argument checks above had a chance to reject it */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  PetscFunctionReturn(0);
}
3801ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArrayRead - Ends the read-only access started with MatSeqAIJCUSPARSEGetArrayRead().

  Input Parameters:
+ A - the matrix; must be of type MATSEQAIJCUSPARSE
- a - address of the pointer previously obtained; it is reset to NULL

  Notes:
  Nothing besides the caller's pointer is modified: the object state of A is not increased.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar** a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* invalidate the caller's handle so that it cannot be used after the restore */
  a[0] = NULL;
  PetscFunctionReturn(0);
}
3811ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSEGetArray - Gives read-write access to the GPU copy of the matrix values.

  Input Parameter:
. A - the matrix; must be of type MATSEQAIJCUSPARSE

  Output Parameter:
. a - set to the raw pointer behind the values array of the CSR structure stored in A->spptr

  Notes:
  Only the CSR storage format is handled; MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB raise PETSC_ERR_SUP.
  MatSeqAIJCUSPARSECopyToGPU() is called before the pointer is handed out.
  Since the caller may modify the values, A->offloadmask is set to PETSC_OFFLOAD_GPU and
  MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE) is called.
  Pair every call with MatSeqAIJCUSPARSERestoreArray().
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* A is dereferenced only now, after the argument checks above had a chance to reject it */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* from here on the GPU holds the reference copy of the values */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3832039c6fbaSStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArray - Ends the read-write access started with MatSeqAIJCUSPARSEGetArray().

  Input Parameters:
+ A - the matrix; must be of type MATSEQAIJCUSPARSE
- a - address of the pointer previously obtained; it is reset to NULL

  Notes:
  The object state of A is increased, since the values may have been modified through the pointer.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar** a)
{
  PetscErrorCode errcode;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* record that the matrix may have changed */
  errcode = PetscObjectStateIncrease((PetscObject)A);
  CHKERRQ(errcode);
  /* invalidate the caller's handle so that it cannot be used after the restore */
  a[0] = NULL;
  PetscFunctionReturn(0);
}
3845039c6fbaSStefano Zampini 
/*
  MatSeqAIJCUSPARSEGetArrayWrite - Gives write access to the GPU copy of the matrix values.

  Input Parameter:
. A - the matrix; must be of type MATSEQAIJCUSPARSE

  Output Parameter:
. a - set to the raw pointer behind the values array of the CSR structure stored in A->spptr

  Notes:
  Only the CSR storage format is handled; MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB raise PETSC_ERR_SUP.
  Unlike MatSeqAIJCUSPARSEGetArray(), MatSeqAIJCUSPARSECopyToGPU() is NOT called here, so the GPU
  structure must already exist (PETSC_ERR_COR is raised otherwise) and its current content is not
  refreshed from the host.
  A->offloadmask is set to PETSC_OFFLOAD_GPU and MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE) is called.
  Pair every call with MatSeqAIJCUSPARSERestoreArrayWrite().
*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar** a)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *csr;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* A is dereferenced only now, after the argument checks above had a chance to reject it */
  cusp = (Mat_SeqAIJCUSPARSE*)A->spptr;
  if (cusp->format == MAT_CUSPARSE_ELL || cusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
  if (!cusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix*)cusp->mat->mat;
  if (!csr->values) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing CUDA memory");
  *a = csr->values->data().get();
  /* from here on the GPU holds the reference copy of the values */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  ierr = MatSeqAIJCUSPARSEInvalidateTranspose(A,PETSC_FALSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3865ed502f03SStefano Zampini 
/*
  MatSeqAIJCUSPARSERestoreArrayWrite - Ends the write access started with MatSeqAIJCUSPARSEGetArrayWrite().

  Input Parameters:
+ A - the matrix; must be of type MATSEQAIJCUSPARSE
- a - address of the pointer previously obtained; it is reset to NULL

  Notes:
  The object state of A is increased, since the values are expected to have been overwritten.
*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar** a)
{
  PetscErrorCode errcode;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  PetscValidPointer(a,2);
  PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
  /* record that the matrix may have changed */
  errcode = PetscObjectStateIncrease((PetscObject)A);
  CHKERRQ(errcode);
  /* invalidate the caller's handle so that it cannot be used after the restore */
  a[0] = NULL;
  PetscFunctionReturn(0);
}
3878ed502f03SStefano Zampini 
/*
  IJCompare4 - "less than" comparison for 4-entry tuples (int, int, PetscScalar, int).

  Tuples are ordered lexicographically by their first two entries: entry 0 is compared first and
  entry 1 breaks ties. Entries 2 and 3 never take part in the comparison, so two tuples that
  agree on entries 0 and 1 are considered equivalent.
*/
struct IJCompare4
{
  /* const-qualified: the functor is stateless and can be invoked through const objects/references */
  __host__ __device__
  inline bool operator() (const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) const
  {
    /* thrust::get<N>() is used instead of the member t.get<N>(), so that the code
       does not depend on the tuple class providing a member accessor */
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};
3889ed502f03SStefano Zampini 
/*
  Shift - unary functor adding a fixed integer offset to its argument.

  The offset is chosen at construction time and stored in _shift;
  operator() returns c + _shift and does not modify the functor.
*/
struct Shift
{
  int _shift; /* offset added to every argument */

  Shift(int shift) : _shift(shift) {}
  /* const-qualified: the call does not change _shift, so the functor can also be
     invoked through const objects/references */
  __host__ __device__
  inline int operator() (const int &c) const
  {
    return c + _shift;
  }
};
3901ed502f03SStefano Zampini 
/* merges two SeqAIJCUSPARSE matrices, [A';B']' operation in matlab notation */
3903ed502f03SStefano Zampini PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C)
3904ed502f03SStefano Zampini {
3905ed502f03SStefano Zampini   PetscErrorCode               ierr;
3906ed502f03SStefano Zampini   Mat_SeqAIJ                   *a = (Mat_SeqAIJ*)A->data, *b = (Mat_SeqAIJ*)B->data, *c;
3907ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE*)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE*)B->spptr, *Ccusp;
3908ed502f03SStefano Zampini   Mat_SeqAIJCUSPARSEMultStruct *Cmat;
3909ed502f03SStefano Zampini   CsrMatrix                    *Acsr,*Bcsr,*Ccsr;
3910ed502f03SStefano Zampini   PetscInt                     Annz,Bnnz;
3911ed502f03SStefano Zampini   cusparseStatus_t             stat;
3912ed502f03SStefano Zampini   PetscInt                     i,m,n,zero = 0;
3913ed502f03SStefano Zampini   cudaError_t                  cerr;
3914ed502f03SStefano Zampini 
3915ed502f03SStefano Zampini   PetscFunctionBegin;
3916ed502f03SStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
3917ed502f03SStefano Zampini   PetscValidHeaderSpecific(B,MAT_CLASSID,2);
3918ed502f03SStefano Zampini   PetscValidPointer(C,4);
3919ed502f03SStefano Zampini   PetscCheckTypeName(A,MATSEQAIJCUSPARSE);
3920ed502f03SStefano Zampini   PetscCheckTypeName(B,MATSEQAIJCUSPARSE);
3921ed502f03SStefano Zampini   if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n);
3922ed502f03SStefano Zampini   if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported");
3923ed502f03SStefano Zampini   if (Acusp->format == MAT_CUSPARSE_ELL || Acusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3924ed502f03SStefano Zampini   if (Bcusp->format == MAT_CUSPARSE_ELL || Bcusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
3925ed502f03SStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
3926ed502f03SStefano Zampini     m     = A->rmap->n;
3927ed502f03SStefano Zampini     n     = A->cmap->n + B->cmap->n;
3928ed502f03SStefano Zampini     ierr  = MatCreate(PETSC_COMM_SELF,C);CHKERRQ(ierr);
3929ed502f03SStefano Zampini     ierr  = MatSetSizes(*C,m,n,m,n);CHKERRQ(ierr);
3930ed502f03SStefano Zampini     ierr  = MatSetType(*C,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3931ed502f03SStefano Zampini     c     = (Mat_SeqAIJ*)(*C)->data;
3932ed502f03SStefano Zampini     Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
3933ed502f03SStefano Zampini     Cmat  = new Mat_SeqAIJCUSPARSEMultStruct;
3934ed502f03SStefano Zampini     Ccsr  = new CsrMatrix;
3935ed502f03SStefano Zampini     Cmat->cprowIndices      = NULL;
3936ed502f03SStefano Zampini     c->compressedrow.use    = PETSC_FALSE;
3937ed502f03SStefano Zampini     c->compressedrow.nrows  = 0;
3938ed502f03SStefano Zampini     c->compressedrow.i      = NULL;
3939ed502f03SStefano Zampini     c->compressedrow.rindex = NULL;
3940ed502f03SStefano Zampini     Ccusp->workVector       = NULL;
3941ed502f03SStefano Zampini     Ccusp->nrows    = m;
3942ed502f03SStefano Zampini     Ccusp->mat      = Cmat;
3943ed502f03SStefano Zampini     Ccusp->mat->mat = Ccsr;
3944ed502f03SStefano Zampini     Ccsr->num_rows  = m;
3945ed502f03SStefano Zampini     Ccsr->num_cols  = n;
3946ed502f03SStefano Zampini     stat = cusparseCreateMatDescr(&Cmat->descr);CHKERRCUSPARSE(stat);
3947ed502f03SStefano Zampini     stat = cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
3948ed502f03SStefano Zampini     stat = cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
3949ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
3950ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
3951ed502f03SStefano Zampini     cerr = cudaMalloc((void **)&(Cmat->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
3952ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3953ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3954ed502f03SStefano Zampini     cerr = cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
3955ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
3956ed502f03SStefano Zampini     ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
39571a2c6b5cSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(A);CHKERRQ(ierr);
39581a2c6b5cSJunchao Zhang     ierr = MatSeqAIJCUSPARSEFormExplicitTransposeForMult(B);CHKERRQ(ierr);
3959ed502f03SStefano Zampini     if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3960ed502f03SStefano Zampini     if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
3961ed502f03SStefano Zampini 
3962ed502f03SStefano Zampini     Acsr = (CsrMatrix*)Acusp->mat->mat;
3963ed502f03SStefano Zampini     Bcsr = (CsrMatrix*)Bcusp->mat->mat;
3964ed502f03SStefano Zampini     Annz = (PetscInt)Acsr->column_indices->size();
3965ed502f03SStefano Zampini     Bnnz = (PetscInt)Bcsr->column_indices->size();
3966ed502f03SStefano Zampini     c->nz = Annz + Bnnz;
3967ed502f03SStefano Zampini     Ccsr->row_offsets = new THRUSTINTARRAY32(m+1);
3968ed502f03SStefano Zampini     Ccsr->column_indices = new THRUSTINTARRAY32(c->nz);
3969ed502f03SStefano Zampini     Ccsr->values = new THRUSTARRAY(c->nz);
3970ed502f03SStefano Zampini     Ccsr->num_entries = c->nz;
3971ed502f03SStefano Zampini     Ccusp->cooPerm = new THRUSTINTARRAY(c->nz);
3972ed502f03SStefano Zampini     if (c->nz) {
39732ed87e7eSStefano Zampini       auto Acoo = new THRUSTINTARRAY32(Annz);
39742ed87e7eSStefano Zampini       auto Bcoo = new THRUSTINTARRAY32(Bnnz);
39752ed87e7eSStefano Zampini       auto Ccoo = new THRUSTINTARRAY32(c->nz);
39762ed87e7eSStefano Zampini       THRUSTINTARRAY32 *Aroff,*Broff;
39772ed87e7eSStefano Zampini 
3978ed502f03SStefano Zampini       if (a->compressedrow.use) { /* need full row offset */
3979ed502f03SStefano Zampini         if (!Acusp->rowoffsets_gpu) {
3980ed502f03SStefano Zampini           Acusp->rowoffsets_gpu  = new THRUSTINTARRAY32(A->rmap->n + 1);
3981ed502f03SStefano Zampini           Acusp->rowoffsets_gpu->assign(a->i,a->i + A->rmap->n + 1);
3982ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((A->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3983ed502f03SStefano Zampini         }
39842ed87e7eSStefano Zampini         Aroff = Acusp->rowoffsets_gpu;
39852ed87e7eSStefano Zampini       } else Aroff = Acsr->row_offsets;
3986ed502f03SStefano Zampini       if (b->compressedrow.use) { /* need full row offset */
3987ed502f03SStefano Zampini         if (!Bcusp->rowoffsets_gpu) {
3988ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu  = new THRUSTINTARRAY32(B->rmap->n + 1);
3989ed502f03SStefano Zampini           Bcusp->rowoffsets_gpu->assign(b->i,b->i + B->rmap->n + 1);
3990ed502f03SStefano Zampini           ierr = PetscLogCpuToGpu((B->rmap->n + 1)*sizeof(PetscInt));CHKERRQ(ierr);
3991ed502f03SStefano Zampini         }
39922ed87e7eSStefano Zampini         Broff = Bcusp->rowoffsets_gpu;
39932ed87e7eSStefano Zampini       } else Broff = Bcsr->row_offsets;
3994ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
39952ed87e7eSStefano Zampini       stat = cusparseXcsr2coo(Acusp->handle,
39962ed87e7eSStefano Zampini                               Aroff->data().get(),
39972ed87e7eSStefano Zampini                               Annz,
39982ed87e7eSStefano Zampini                               m,
39992ed87e7eSStefano Zampini                               Acoo->data().get(),
40002ed87e7eSStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4001ed502f03SStefano Zampini       stat = cusparseXcsr2coo(Bcusp->handle,
40022ed87e7eSStefano Zampini                               Broff->data().get(),
4003ed502f03SStefano Zampini                               Bnnz,
4004ed502f03SStefano Zampini                               m,
40052ed87e7eSStefano Zampini                               Bcoo->data().get(),
4006ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
40072ed87e7eSStefano Zampini       /* Issues when using bool with large matrices on SUMMIT 10.2.89 */
40082ed87e7eSStefano Zampini       auto Aperm = thrust::make_constant_iterator(1);
40092ed87e7eSStefano Zampini       auto Bperm = thrust::make_constant_iterator(0);
40108909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(10,0,0)
4011ed502f03SStefano Zampini       auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(),Shift(A->cmap->n));
4012ed502f03SStefano Zampini       auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(),Shift(A->cmap->n));
40138909a122SStefano Zampini #else
40148909a122SStefano Zampini       /* there are issues instantiating the merge operation using a transform iterator for the columns of B */
40158909a122SStefano Zampini       auto Bcib = Bcsr->column_indices->begin();
40168909a122SStefano Zampini       auto Bcie = Bcsr->column_indices->end();
40178909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(A->cmap->n));
40188909a122SStefano Zampini #endif
40192ed87e7eSStefano Zampini       auto wPerm = new THRUSTINTARRAY32(Annz+Bnnz);
40202ed87e7eSStefano Zampini       auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(),Acsr->column_indices->begin(),Acsr->values->begin(),Aperm));
40212ed87e7eSStefano Zampini       auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(),Acsr->column_indices->end(),Acsr->values->end(),Aperm));
40222ed87e7eSStefano Zampini       auto Bzb = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(),Bcib,Bcsr->values->begin(),Bperm));
40232ed87e7eSStefano Zampini       auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(),Bcie,Bcsr->values->end(),Bperm));
40242ed87e7eSStefano Zampini       auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(),Ccsr->column_indices->begin(),Ccsr->values->begin(),wPerm->begin()));
4025ed502f03SStefano Zampini       auto p1 = Ccusp->cooPerm->begin();
4026ed502f03SStefano Zampini       auto p2 = Ccusp->cooPerm->begin();
4027ed502f03SStefano Zampini       thrust::advance(p2,Annz);
40282ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::merge(thrust::device,Azb,Aze,Bzb,Bze,Czb,IJCompare4()));
40298909a122SStefano Zampini #if PETSC_PKG_CUDA_VERSION_LT(10,0,0)
40308909a122SStefano Zampini       thrust::transform(Bcib,Bcie,Bcib,Shift(-A->cmap->n));
40318909a122SStefano Zampini #endif
40322ed87e7eSStefano Zampini       auto cci = thrust::make_counting_iterator(zero);
40332ed87e7eSStefano Zampini       auto cce = thrust::make_counting_iterator(c->nz);
40342ed87e7eSStefano Zampini #if 0 //Errors on SUMMIT cuda 11.1.0
40352ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>()));
40362ed87e7eSStefano Zampini #else
40372ed87e7eSStefano Zampini       auto pred = thrust::identity<int>();
40382ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::copy_if(thrust::device,cci,cce,wPerm->begin(),p1,pred));
40392ed87e7eSStefano Zampini       PetscStackCallThrust(thrust::remove_copy_if(thrust::device,cci,cce,wPerm->begin(),p2,pred));
40402ed87e7eSStefano Zampini #endif
4041ed502f03SStefano Zampini       stat = cusparseXcoo2csr(Ccusp->handle,
40422ed87e7eSStefano Zampini                               Ccoo->data().get(),
4043ed502f03SStefano Zampini                               c->nz,
4044ed502f03SStefano Zampini                               m,
4045ed502f03SStefano Zampini                               Ccsr->row_offsets->data().get(),
4046ed502f03SStefano Zampini                               CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4047ed502f03SStefano Zampini       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4048ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
40492ed87e7eSStefano Zampini       delete wPerm;
40502ed87e7eSStefano Zampini       delete Acoo;
40512ed87e7eSStefano Zampini       delete Bcoo;
40522ed87e7eSStefano Zampini       delete Ccoo;
4053ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4054ed502f03SStefano Zampini       stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries,
4055ed502f03SStefano Zampini                                Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(),
4056ed502f03SStefano Zampini                                CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4057ed502f03SStefano Zampini                                CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4058ed502f03SStefano Zampini #endif
40591a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */
4060ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4061ed502f03SStefano Zampini         Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct;
4062ed502f03SStefano Zampini         CsrMatrix *CcsrT = new CsrMatrix;
4063ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4064ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4065ed502f03SStefano Zampini 
40661a2c6b5cSJunchao Zhang         (*C)->form_explicit_transpose = PETSC_TRUE;
40671a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4068a49f1ed0SStefano Zampini         Ccusp->rowoffsets_gpu = NULL;
4069ed502f03SStefano Zampini         CmatT->cprowIndices = NULL;
4070ed502f03SStefano Zampini         CmatT->mat = CcsrT;
4071ed502f03SStefano Zampini         CcsrT->num_rows = n;
4072ed502f03SStefano Zampini         CcsrT->num_cols = m;
4073ed502f03SStefano Zampini         CcsrT->num_entries = c->nz;
4074ed502f03SStefano Zampini 
4075ed502f03SStefano Zampini         CcsrT->row_offsets = new THRUSTINTARRAY32(n+1);
4076ed502f03SStefano Zampini         CcsrT->column_indices = new THRUSTINTARRAY32(c->nz);
4077ed502f03SStefano Zampini         CcsrT->values = new THRUSTARRAY(c->nz);
4078ed502f03SStefano Zampini 
4079ed502f03SStefano Zampini         ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4080ed502f03SStefano Zampini         auto rT = CcsrT->row_offsets->begin();
4081ed502f03SStefano Zampini         if (AT) {
4082ed502f03SStefano Zampini           rT = thrust::copy(AcsrT->row_offsets->begin(),AcsrT->row_offsets->end(),rT);
4083ed502f03SStefano Zampini           thrust::advance(rT,-1);
4084ed502f03SStefano Zampini         }
4085ed502f03SStefano Zampini         if (BT) {
4086ed502f03SStefano Zampini           auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(),Shift(a->nz));
4087ed502f03SStefano Zampini           auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(),Shift(a->nz));
4088ed502f03SStefano Zampini           thrust::copy(titb,tite,rT);
4089ed502f03SStefano Zampini         }
4090ed502f03SStefano Zampini         auto cT = CcsrT->column_indices->begin();
4091ed502f03SStefano Zampini         if (AT) cT = thrust::copy(AcsrT->column_indices->begin(),AcsrT->column_indices->end(),cT);
4092ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->column_indices->begin(),BcsrT->column_indices->end(),cT);
4093ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4094ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4095ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
4096ed502f03SStefano Zampini         cerr = WaitForCUDA();CHKERRCUDA(cerr);
4097ed502f03SStefano Zampini         ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4098ed502f03SStefano Zampini 
4099ed502f03SStefano Zampini         stat = cusparseCreateMatDescr(&CmatT->descr);CHKERRCUSPARSE(stat);
4100ed502f03SStefano Zampini         stat = cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO);CHKERRCUSPARSE(stat);
4101ed502f03SStefano Zampini         stat = cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL);CHKERRCUSPARSE(stat);
4102ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->alpha_one),sizeof(PetscScalar));CHKERRCUDA(cerr);
4103ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_zero),sizeof(PetscScalar));CHKERRCUDA(cerr);
4104ed502f03SStefano Zampini         cerr = cudaMalloc((void **)&(CmatT->beta_one), sizeof(PetscScalar));CHKERRCUDA(cerr);
4105ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->alpha_one,&PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4106ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_zero,&PETSC_CUSPARSE_ZERO,sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4107ed502f03SStefano Zampini         cerr = cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
4108ed502f03SStefano Zampini #if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
4109ed502f03SStefano Zampini         stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries,
4110ed502f03SStefano Zampini                                  CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(),
4111ed502f03SStefano Zampini                                  CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
4112ed502f03SStefano Zampini                                  CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype);CHKERRCUSPARSE(stat);
4113ed502f03SStefano Zampini #endif
4114ed502f03SStefano Zampini         Ccusp->matTranspose = CmatT;
4115ed502f03SStefano Zampini       }
4116ed502f03SStefano Zampini     }
4117ed502f03SStefano Zampini 
4118ed502f03SStefano Zampini     c->singlemalloc = PETSC_FALSE;
4119ed502f03SStefano Zampini     c->free_a       = PETSC_TRUE;
4120ed502f03SStefano Zampini     c->free_ij      = PETSC_TRUE;
4121ed502f03SStefano Zampini     ierr = PetscMalloc1(m+1,&c->i);CHKERRQ(ierr);
4122ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->j);CHKERRQ(ierr);
4123ed502f03SStefano Zampini     if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64 bit conversion on the GPU and then copy to host (lazy) */
4124ed502f03SStefano Zampini       THRUSTINTARRAY ii(Ccsr->row_offsets->size());
4125ed502f03SStefano Zampini       THRUSTINTARRAY jj(Ccsr->column_indices->size());
4126ed502f03SStefano Zampini       ii   = *Ccsr->row_offsets;
4127ed502f03SStefano Zampini       jj   = *Ccsr->column_indices;
4128ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,ii.data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4129ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,jj.data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4130ed502f03SStefano Zampini     } else {
4131ed502f03SStefano Zampini       cerr = cudaMemcpy(c->i,Ccsr->row_offsets->data().get(),Ccsr->row_offsets->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4132ed502f03SStefano Zampini       cerr = cudaMemcpy(c->j,Ccsr->column_indices->data().get(),Ccsr->column_indices->size()*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
4133ed502f03SStefano Zampini     }
4134ed502f03SStefano Zampini     ierr = PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size())*sizeof(PetscInt));CHKERRQ(ierr);
4135ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->ilen);CHKERRQ(ierr);
4136ed502f03SStefano Zampini     ierr = PetscMalloc1(m,&c->imax);CHKERRQ(ierr);
4137ed502f03SStefano Zampini     c->maxnz = c->nz;
4138ed502f03SStefano Zampini     c->nonzerorowcnt = 0;
4139ed502f03SStefano Zampini     c->rmax = 0;
4140ed502f03SStefano Zampini     for (i = 0; i < m; i++) {
4141ed502f03SStefano Zampini       const PetscInt nn = c->i[i+1] - c->i[i];
4142ed502f03SStefano Zampini       c->ilen[i] = c->imax[i] = nn;
4143ed502f03SStefano Zampini       c->nonzerorowcnt += (PetscInt)!!nn;
4144ed502f03SStefano Zampini       c->rmax = PetscMax(c->rmax,nn);
4145ed502f03SStefano Zampini     }
4146ed502f03SStefano Zampini     ierr = MatMarkDiagonal_SeqAIJ(*C);CHKERRQ(ierr);
4147ed502f03SStefano Zampini     ierr = PetscMalloc1(c->nz,&c->a);CHKERRQ(ierr);
4148ed502f03SStefano Zampini     (*C)->nonzerostate++;
4149ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->rmap);CHKERRQ(ierr);
4150ed502f03SStefano Zampini     ierr = PetscLayoutSetUp((*C)->cmap);CHKERRQ(ierr);
4151ed502f03SStefano Zampini     Ccusp->nonzerostate = (*C)->nonzerostate;
4152ed502f03SStefano Zampini     (*C)->preallocated  = PETSC_TRUE;
4153ed502f03SStefano Zampini   } else {
4154ed502f03SStefano Zampini     if ((*C)->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",(*C)->rmap->n,B->rmap->n);
4155ed502f03SStefano Zampini     c = (Mat_SeqAIJ*)(*C)->data;
4156ed502f03SStefano Zampini     if (c->nz) {
4157ed502f03SStefano Zampini       Ccusp = (Mat_SeqAIJCUSPARSE*)(*C)->spptr;
4158ed502f03SStefano Zampini       if (!Ccusp->cooPerm) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cooPerm");
4159ed502f03SStefano Zampini       if (Ccusp->format == MAT_CUSPARSE_ELL || Ccusp->format == MAT_CUSPARSE_HYB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not implemented");
4160ed502f03SStefano Zampini       if (Ccusp->nonzerostate != (*C)->nonzerostate) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Wrong nonzerostate");
4161ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
4162ed502f03SStefano Zampini       ierr = MatSeqAIJCUSPARSECopyToGPU(B);CHKERRQ(ierr);
4163ed502f03SStefano Zampini       if (!Acusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4164ed502f03SStefano Zampini       if (!Bcusp->mat) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Mat_SeqAIJCUSPARSEMultStruct");
4165ed502f03SStefano Zampini       Acsr = (CsrMatrix*)Acusp->mat->mat;
4166ed502f03SStefano Zampini       Bcsr = (CsrMatrix*)Bcusp->mat->mat;
4167ed502f03SStefano Zampini       Ccsr = (CsrMatrix*)Ccusp->mat->mat;
4168ed502f03SStefano Zampini       if (Acsr->num_entries != (PetscInt)Acsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"A nnz %D != %D",Acsr->num_entries,(PetscInt)Acsr->values->size());
4169ed502f03SStefano Zampini       if (Bcsr->num_entries != (PetscInt)Bcsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"B nnz %D != %D",Bcsr->num_entries,(PetscInt)Bcsr->values->size());
4170ed502f03SStefano Zampini       if (Ccsr->num_entries != (PetscInt)Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D",Ccsr->num_entries,(PetscInt)Ccsr->values->size());
4171ed502f03SStefano Zampini       if (Ccsr->num_entries != Acsr->num_entries + Bcsr->num_entries) SETERRQ3(PETSC_COMM_SELF,PETSC_ERR_COR,"C nnz %D != %D + %D",Ccsr->num_entries,Acsr->num_entries,Bcsr->num_entries);
4172ed502f03SStefano Zampini       if (Ccusp->cooPerm->size() != Ccsr->values->size()) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_COR,"permSize %D != %D",(PetscInt)Ccusp->cooPerm->size(),(PetscInt)Ccsr->values->size());
4173ed502f03SStefano Zampini       auto pmid = Ccusp->cooPerm->begin();
4174ed502f03SStefano Zampini       thrust::advance(pmid,Acsr->num_entries);
4175ed502f03SStefano Zampini       ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
4176ed502f03SStefano Zampini       auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(),
4177ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->begin())));
4178ed502f03SStefano Zampini       auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(),
4179ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4180ed502f03SStefano Zampini       thrust::for_each(zibait,zieait,VecCUDAEquals());
4181ed502f03SStefano Zampini       auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(),
4182ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),pmid)));
4183ed502f03SStefano Zampini       auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(),
4184ed502f03SStefano Zampini                                                                  thrust::make_permutation_iterator(Ccsr->values->begin(),Ccusp->cooPerm->end())));
4185ed502f03SStefano Zampini       thrust::for_each(zibbit,ziebit,VecCUDAEquals());
4186a49f1ed0SStefano Zampini       ierr = MatSeqAIJCUSPARSEInvalidateTranspose(*C,PETSC_FALSE);CHKERRQ(ierr);
41871a2c6b5cSJunchao Zhang       if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) {
4188ed502f03SStefano Zampini         if (!Ccusp->matTranspose) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing transpose Mat_SeqAIJCUSPARSEMultStruct");
4189ed502f03SStefano Zampini         PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE;
4190ed502f03SStefano Zampini         CsrMatrix *AcsrT = AT ? (CsrMatrix*)Acusp->matTranspose->mat : NULL;
4191ed502f03SStefano Zampini         CsrMatrix *BcsrT = BT ? (CsrMatrix*)Bcusp->matTranspose->mat : NULL;
4192ed502f03SStefano Zampini         CsrMatrix *CcsrT = (CsrMatrix*)Ccusp->matTranspose->mat;
4193ed502f03SStefano Zampini         auto vT = CcsrT->values->begin();
4194ed502f03SStefano Zampini         if (AT) vT = thrust::copy(AcsrT->values->begin(),AcsrT->values->end(),vT);
4195ed502f03SStefano Zampini         if (BT) thrust::copy(BcsrT->values->begin(),BcsrT->values->end(),vT);
41961a2c6b5cSJunchao Zhang         (*C)->transupdated = PETSC_TRUE;
4197ed502f03SStefano Zampini       }
4198ed502f03SStefano Zampini       cerr = WaitForCUDA();CHKERRCUDA(cerr);
4199ed502f03SStefano Zampini       ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
4200ed502f03SStefano Zampini     }
4201ed502f03SStefano Zampini   }
4202ed502f03SStefano Zampini   ierr = PetscObjectStateIncrease((PetscObject)*C);CHKERRQ(ierr);
4203ed502f03SStefano Zampini   (*C)->assembled     = PETSC_TRUE;
4204ed502f03SStefano Zampini   (*C)->was_assembled = PETSC_FALSE;
4205ed502f03SStefano Zampini   (*C)->offloadmask   = PETSC_OFFLOAD_GPU;
4206ed502f03SStefano Zampini   PetscFunctionReturn(0);
4207ed502f03SStefano Zampini }
4208c215019aSStefano Zampini 
/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies entries of the device value array of A into v.

  Input Parameters:
+ A   - matrix whose device value array is obtained with MatSeqAIJCUSPARSEGetArrayRead()
. n   - number of entries to copy
- idx - offsets into the value array of the entries to gather; when NULL (or when n is 0)
        the first n values are copied contiguously

  Output Parameter:
. v   - destination of length n; isCudaMem() decides whether it is treated as device or host memory

  Notes:
  When v is host memory the gathered values are staged in a temporary device buffer and then
  copied back, and the device-to-host traffic is logged with PetscLogGpuToCpu().
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  PetscErrorCode    ierr;
  bool              dmem;
  const PetscScalar *av;
  cudaError_t       cerr;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  ierr = MatSeqAIJCUSPARSEGetArrayRead(A,&av);CHKERRQ(ierr);
  if (n && idx) {
    /* the index list lives on the host: stage it on the device so the gather runs there */
    THRUSTINTARRAY widx(n);
    widx.assign(idx,idx+n);
    ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);

    /* Device staging buffer, only given a size when v is host memory.
       It is a stack object so it is released on every exit path, including the
       early returns taken by the error-checking macros below. */
    THRUSTARRAY                     w(dmem ? (size_t)0 : (size_t)n);
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) dv = thrust::device_pointer_cast(v);
    else      dv = w.data();
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* pair av[widx[k]] with dv[k] and apply the VecCUDAEquals functor to each pair */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.begin()),dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav,widx.end()),dv+n));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    if (!dmem) {
      cerr = cudaMemcpy(v,w.data().get(),n*sizeof(PetscScalar),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    }
  } else {
    cerr = cudaMemcpy(v,av,n*sizeof(PetscScalar),dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  }
  /* the values travelled from the device to the host, so log them in that direction */
  if (!dmem) { ierr = PetscLogGpuToCpu(n*sizeof(PetscScalar));CHKERRQ(ierr); }
  ierr = MatSeqAIJCUSPARSERestoreArrayRead(A,&av);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4248bddcd29dSMark Adams 
4249bddcd29dSMark Adams /*
4250bddcd29dSMark Adams   LU BAND factorization with optimization for block diagonal (Nf blocks) in natural order (-mat_no_inode -pc_factor_mat_ordering_type rcm with Nf>1 fields)
4251bddcd29dSMark Adams 
4252bddcd29dSMark Adams   requires:
4253bddcd29dSMark Adams      structurally symmetric: fix with transpose/column meta data
4254bddcd29dSMark Adams */
4255bddcd29dSMark Adams 
4256bddcd29dSMark Adams /*
4257bddcd29dSMark Adams   The GPU LU factor kernel
4258bddcd29dSMark Adams */
/*
  mat_lu_factor_band_init_set_i - fills the row-offset array bi_csr of the band storage.

  n      - number of rows
  bw     - band width used in the offset formula
  bi_csr - output array; entries 0..n are written (bi_csr[0] is set to 0 by one thread of the grid)

  Launch layout: gridDim.x is used as the number of fields (n/gridDim.x rows each) and
  gridDim.y splits the rows of a field into chunks. Rows of a chunk are strided over
  threadIdx.y; only threads with threadIdx.x == 0 write anything.
*/
__global__
void __launch_bounds__(1024,1)
mat_lu_factor_band_init_set_i(const PetscInt n, const int bw, int bi_csr[])
{
  const PetscInt numFields = gridDim.x, numChunks = gridDim.y;
  const PetscInt fieldId   = blockIdx.x, chunkId  = blockIdx.y;
  const PetscInt fieldRows = n/numFields;
  const PetscInt chunkRows = (fieldRows/numChunks + !!(fieldRows%numChunks)); // ceil(fieldRows/numChunks)
  const PetscInt firstRow  = fieldId*fieldRows + chunkId*chunkRows;
  const PetscInt fieldEnd  = (fieldId+1)*fieldRows;
  const PetscInt lastRow   = (firstRow + chunkRows) > fieldEnd ? fieldEnd : (firstRow + chunkRows); // do not run into the next field

  // every write below is done by the x == 0 thread of a row, the others have nothing to do
  if (threadIdx.x != 0) return;

  // leading zero of the offset array, written once for the whole grid
  if (threadIdx.y + blockIdx.x + blockIdx.y == 0) bi_csr[0] = 0;

  for (int row = firstRow + threadIdx.y; row < lastRow; row += blockDim.y) {
    const PetscInt rowCount  = row+1;                                // rows 0..row
    const PetscInt triRows   = (row>bw) ? bw+1 : rowCount;           // rows whose lower part is still growing
    const PetscInt lowerTri  = triRows*(triRows-1)/2;                // lower entries of those rows
    const PetscInt upperAll  = rowCount*bw;                          // bw upper entries per row, before clipping
    const PetscInt lowerFull = bw*((row>bw) ? (row-bw) : 0);         // rows carrying a full lower band
    const PetscInt overhang  = bw + row + 1 - n;                     // columns past the last one for this row
    const PetscInt clipped   = (overhang>0) ? overhang*(overhang-1)/2 + overhang : 0; // upper entries cut off so far
    bi_csr[row+1] = lowerTri + upperAll - clipped + lowerFull + rowCount; // + rowCount: one diagonal per row
  }
}
/*
  mat_lu_factor_band_copy_aij_aij - copy AIJ to AIJ_BAND

  Zeroes the band storage ba_csr and then scatters the permuted entries of A into it.

  n              - number of rows
  bw             - band width; row rowb of the band matrix starts at column max(rowb-bw,0)
  r              - for band row rowb, r[rowb] is the row of A that is read
  ic             - ic[col of A] is the column in the band matrix
  ai_d,aj_d,aa_d - CSR row offsets, column indices and values of A
  bi_csr         - row offsets of the band matrix (n+1 entries)
  ba_csr         - band values (output); the extra slot ba_csr[bi_csr[n]] is also zeroed

  Launch layout: gridDim.x is used as the number of fields (n/gridDim.x rows each),
  gridDim.y splits the rows of a field into chunks, rows of a chunk are strided over
  threadIdx.y and the entries of a row over threadIdx.x. A block only touches its own rows.
  No bounds check is made on the destination column: every entry of A is assumed to lie
  inside the band.
*/
__global__
void __launch_bounds__(1024,1)
mat_lu_factor_band_copy_aij_aij(const PetscInt n, const int bw, const PetscInt r[], const PetscInt ic[],
                                const int ai_d[], const int aj_d[], const PetscScalar aa_d[],
                                const int bi_csr[], PetscScalar ba_csr[])
{
  const PetscInt  Nf = gridDim.x, Nblk = gridDim.y, nloc = n/Nf;
  const PetscInt  field = blockIdx.x, blkIdx = blockIdx.y;
  const PetscInt  nloc_i =  (nloc/Nblk + !!(nloc%Nblk)), start_i = field*nloc + blkIdx*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i);

  // zero B
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y == 0) ba_csr[bi_csr[n]] = 0; // flop count at end
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    PetscScalar    *batmp = ba_csr + bi_csr[rowb];
    const PetscInt nzb = bi_csr[rowb+1] - bi_csr[rowb];
    for (int j=threadIdx.x ; j<nzb ; j += blockDim.x) batmp[j] = 0;
  }

  // The scatter below writes entries that a different thread of the same row zeroed above.
  // Threads sharing a threadIdx.y are not guaranteed to run in lock-step, so without a
  // barrier a late zero could overwrite a copied value. Every thread of the block reaches
  // this point, which makes the barrier safe.
  __syncthreads();

  // copy A into B with CSR format
  for (int rowb = start_i + threadIdx.y; rowb < end_i; rowb += blockDim.y) { // rows in block by thread y
    const PetscInt    rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa];
    const int         *ajtmp = aj_d + ai_d[rowa], bjStart = (rowb>bw) ? rowb-bw : 0; // first column stored for this band row
    const PetscScalar *av    = aa_d + ai_d[rowa];
    PetscScalar       *batmp = ba_csr + bi_csr[rowb];
    /* load in initial (unfactored row) */
    for (int j=threadIdx.x ; j<nza ; j += blockDim.x) {
      const PetscInt colb = ic[ajtmp[j]], idx = colb - bjStart; // permuted column -> offset inside the band row
      batmp[idx] = av[j];
    }
  }
}
/*
  print_mat_aij_band - debugging aid that prints the band matrix (AIJ_BAND) from the device.

  n      - number of rows
  bi_csr - row offsets (n+1 entries)
  ba_csr - values; only the real part of each entry is printed

  A single thread of the grid (all thread and block indices zero) does all the printing,
  one row per output line followed by the end offset of that row.
*/
__global__
void print_mat_aij_band(const PetscInt n, const int bi_csr[], const PetscScalar ba_csr[])
{
  // debug: everybody but the very first thread leaves immediately
  if (threadIdx.x + threadIdx.y + blockIdx.x + blockIdx.y != 0) return;

  printf("B (AIJ) n=%d:\n",(int)n);
  for (int row = 0; row < n; row++) {
    const int rowBegin = bi_csr[row], rowEnd = bi_csr[row+1];
    for (int k = rowBegin; k < rowEnd; k++) printf("(%13.6e) ",PetscRealPart(ba_csr[k]));
    printf(" bi=%d\n",rowEnd);
  }
}
/*
  mat_lu_factor_band - Band LU kernel, factors the band matrix held in bi_csr/ba_csr in place.

  n      - number of rows
  bw     - band width
  bi_csr - row offsets of the band storage
  ba_csr - band values, overwritten

  Launch layout: gridDim.x is used as the number of fields (n/gridDim.x rows each, factored
  independently), gridDim.y blocks share the rows below the current pivot of a field, rows
  are distributed over threadIdx.y and the entries of a row over threadIdx.x.
  Dynamic shared memory: one PetscInt per threadIdx.y value.
  With CUDA >= 11 the pivot steps are separated by a grid-wide sync, so the kernel has to be
  started with a cooperative launch; otherwise only a block-level barrier is used.
  The code assumes a symmetric nonzero structure (see the original remarks kept below).
*/
__global__
void __launch_bounds__(1024,1)
mat_lu_factor_band(const PetscInt n, const PetscInt bw, const int bi_csr[], PetscScalar ba_csr[])
{
  extern __shared__ PetscInt smemInt[];
  PetscInt        *pivColSlot = smemInt; // pivColSlot[threadIdx.y]: offset of the pivot column inside that thread-row's band row
  const PetscInt  numFields = gridDim.x, numBlocks = gridDim.y;
  const PetscInt  fieldId = blockIdx.x, blockId = blockIdx.y;
  const PetscInt  fieldRows = n/numFields;
  const PetscInt  firstRow = fieldId*fieldRows, endRow = firstRow + fieldRows;
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
  auto grid = cooperative_groups::this_grid();
#endif
  // A22 panel update for each row A(1,:) and col A(:,1)
  for (int piv = firstRow; piv < endRow; piv++) {
    const PetscInt    rowsLeft = endRow-1 - piv;                        // we are chopping off the inter ears
    const PetscInt    nzUd     = (bw>rowsLeft) ? rowsLeft : bw;         // entries right of (and rows below) the pivot
    const PetscInt    dOffset  = (piv > bw) ? bw : piv;                 // position of the diagonal inside the pivot row
    const PetscInt    nzUd_pad = blockDim.y*(nzUd/blockDim.y + !!(nzUd%blockDim.y)); // rounded up so all threadIdx.y of a block take the same number of trips
    PetscScalar       *pBdd    = ba_csr + bi_csr[piv] + dOffset;
    const PetscScalar *baUd    = pBdd + 1;                              // vector of data  U(i,i+1:end)
    const PetscScalar Bdd      = *pBdd;                                 // pivot value
    const PetscInt    offset   = blockId*blockDim.y + threadIdx.y, inc = numBlocks*blockDim.y;
    // NOTE(review): the shared slot is rewritten on the next trip of this loop without a second
    // barrier; looks safe only while the loop runs at most once per pivot -- confirm against the launch configuration.
    for (int idx = offset; idx < nzUd_pad ; idx += inc) { /* assuming symmetric structure */
      const int  row    = piv + idx + 1;   // row below the pivot handled by this thread-row
      const bool active = idx < nzUd;      // padded trips only take part in the barrier
      if (active && threadIdx.x==0) {
        const PetscInt lead = row > bw ? bw : row, col = lead - (row-piv); // cuts off just the first (global) block
        PetscScalar    *Aid = ba_csr + bi_csr[row] + col;
        *Aid = *Aid/Bdd;                   // scale the entry in the pivot column by the pivot
        pivColSlot[threadIdx.y] = col;     // publish the offset to the other x-threads of this row
      }
      __syncthreads(); // synch on threadIdx.x only
      if (active) {
        PetscScalar       *Aid = ba_csr + bi_csr[row] + pivColSlot[threadIdx.y];
        const PetscScalar Lid  = *Aid;     // multiplier computed above
        PetscScalar       *Aij = Aid + 1;  // entries right of the pivot column in this row
        for (int j=threadIdx.x ; j<nzUd ; j += blockDim.x) Aij[j] -= Lid*baUd[j];
      }
    }
    // all rows must be updated before the next pivot is read
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
    grid.sync();
#else
    __syncthreads();
#endif
  } /* end of loop over pivots */
}
4384bddcd29dSMark Adams 
4385bddcd29dSMark Adams static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat,Vec,Vec);
/*
  MatLUFactorNumeric_SeqAIJCUSPARSEBAND - numeric phase of the band LU factorization on the GPU.

  Input Parameters:
+ B    - factor matrix; its spptr holds the band arrays (a_band_d, i_band_d) and the row/column permutations
. A    - matrix to factor; its device CSR data are read
- info - factorization options (not used here)

  Notes:
  If a container named "Nf" is composed with A, its value modulo 1000 is used as the number of
  fields (first grid dimension of the kernels) and its value divided by 1000, when positive, as
  "nconcurrent", which reduces the number of blocks used per field.
  The band width is recovered from n and b->nz.
  On return B->ops->solve is MatSolve_SeqAIJCUSPARSEBAND and the other solve variants are cleared.
  For an empty matrix the routine returns immediately without touching B->ops.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSEBAND(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat_SeqAIJ                   *b = (Mat_SeqAIJ*)B->data;
  Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
  Mat_SeqAIJCUSPARSE           *cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstructA;
  CsrMatrix                    *matrixA;
  PetscErrorCode               ierr;
  cudaError_t                  cerr;
  const PetscInt               n=A->rmap->n, *ic, *r;
  const int                    *ai_d, *aj_d;
  const PetscScalar            *aa_d;
  PetscScalar                  *ba_t;
  int                          *bi_t;
  PetscContainer               container;
  int                          Ni = 1, team_size, Nf = 1, nVec, nconcurrent = 1, nsm = -1;

  PetscFunctionBegin;
  if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparseTriFactors");
  if (A->rmap->n == 0) {
    PetscFunctionReturn(0);
  }
  ba_t = cusparseTriFactors->a_band_d;
  bi_t = cusparseTriFactors->i_band_d;
  // cusparse setup
  if (!cusparsestructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing cusparsestructA");
  matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; //  matstruct->cprowIndices
  if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing mat struct");
  matrixA = (CsrMatrix*)matstructA->mat;
  if (!matrixA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing matrix cusparsestructA->mat->mat");
  if (!cusparseTriFactors->rpermIndices || !cusparseTriFactors->cpermIndices) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing permutation indices");

  // factor: get Nf if available
  ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
  if (container) {
    PetscInt *pNf=NULL;
    ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
    Nf = (int)((*pNf)%1000);
    if ((*pNf)/1000>0) nconcurrent = (int)((*pNf)/1000); // number of SMs to use
  }
  // Nf is used as a divisor and as a grid dimension below
  if (Nf < 1) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Invalid number of fields %d",Nf);
  if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n mod Nf != 0 %D %D",n,(PetscInt)Nf);

  // get data
  ic      = thrust::raw_pointer_cast(cusparseTriFactors->cpermIndices->data());
  ai_d    = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  aj_d    = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa_d    = thrust::raw_pointer_cast(matrixA->values->data().get());
  r       = thrust::raw_pointer_cast(cusparseTriFactors->rpermIndices->data());

  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  {
    // Solve nz = n + (2n-1)*bw - bw^2 for bw. The discriminant is formed in floating point
    // because n*n does not fit in a 32-bit integer once n exceeds 46340.
    const PetscReal disc = 1 + 4*((PetscReal)n*(PetscReal)n - (PetscReal)b->nz);
    int             bw   = (2*n-1 - (int)(PetscSqrtReal(disc)+PETSC_MACHINE_EPSILON))/2;
#if PETSC_PKG_CUDA_VERSION_LT(11,0,0)
    Ni = 1; // the kernel only has a block-level barrier here, so one block per field
#else
    {
      int            gpuid;
      cudaDeviceProp prop;
      cerr = cudaGetDevice(&gpuid);CHKERRCUDA(cerr);
      cerr = cudaGetDeviceProperties(&prop, gpuid);CHKERRCUDA(cerr);
      nsm  = prop.multiProcessorCount;
      Ni   = PetscMax(1, nsm/Nf/nconcurrent); // at least one block per field, Ni is a divisor below
    }
#endif
    // a diagonal matrix has bw == 0: keep the block dimensions at least 1 so the launch stays valid
    team_size = PetscMax(1, bw/Ni + !!(bw%Ni));
    nVec      = PetscMax(1, PetscMin(bw, 1024/team_size));
    ierr = PetscInfo5(A,"Matrix Bandwidth = %d, number SMs/block = %d, num concurency = %d, num fields = %d, numSMs/GPU = %d\n",bw,Ni,nconcurrent,Nf,nsm);CHKERRQ(ierr);
    {
      dim3 dimBlockTeam(nVec,team_size);
      dim3 dimBlockLeague(Nf,Ni);
      mat_lu_factor_band_copy_aij_aij<<<dimBlockLeague,dimBlockTeam>>>(n, bw, r, ic, ai_d, aj_d, aa_d, bi_t, ba_t);
      CHECK_LAUNCH_ERROR(); // does a sync
#if PETSC_PKG_CUDA_VERSION_GE(11,0,0)
      {
        // kernelArgs entries are read with the size of the kernel parameters: the band width
        // parameter of mat_lu_factor_band is a PetscInt, so hand over a PetscInt, not the int
        PetscInt bw_arg = bw;
        void     *kernelArgs[] = { (void*)&n, (void*)&bw_arg, (void*)&bi_t, (void*)&ba_t};
        cerr = cudaLaunchCooperativeKernel((void*)mat_lu_factor_band, dimBlockLeague, dimBlockTeam, kernelArgs, team_size*sizeof(PetscInt), NULL);CHKERRCUDA(cerr);
      }
#else
      mat_lu_factor_band<<<dimBlockLeague,dimBlockTeam,team_size*sizeof(PetscInt)>>>(n, bw, bi_t, ba_t);
#endif
      CHECK_LAUNCH_ERROR(); // does a sync
#if defined(PETSC_USE_LOG)
      {
        // same count as before, evaluated in floating point so large band widths cannot overflow an int
        const PetscLogDouble bm1 = bw-1, dbw = bw, nl = n/Nf;
        ierr = PetscLogGpuFlops((PetscLogDouble)Nf*(bm1*(bm1 + 1)*(2*bm1 + 1)/3 + 2*(nl-dbw)*dbw*dbw + nl*(nl+1)/2));CHKERRQ(ierr);
      }
#endif
    }
  }
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* determine which version of MatSolve needs to be used. from MatLUFactorNumeric_AIJ_SeqAIJCUSPARSE */
  B->ops->solve = MatSolve_SeqAIJCUSPARSEBAND;
  B->ops->solvetranspose = NULL; // need transpose
  B->ops->matsolve = NULL;
  B->ops->matsolvetranspose = NULL;

  PetscFunctionReturn(0);
}
4477bddcd29dSMark Adams 
/*
  MatrixNfDestroy - destroy callback installed with PetscContainerSetUserDestroy();
  releases the buffer passed in ptr with PetscFree().
*/
static PetscErrorCode MatrixNfDestroy(void *ptr)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscFree(ptr);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4486bddcd29dSMark Adams 
4487bddcd29dSMark Adams PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSEBAND(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
4488bddcd29dSMark Adams {
4489bddcd29dSMark Adams   Mat_SeqAIJ         *a = (Mat_SeqAIJ*)A->data,*b;
4490bddcd29dSMark Adams   IS                 isicol;
4491bddcd29dSMark Adams   PetscErrorCode     ierr;
4492bddcd29dSMark Adams   cudaError_t        cerr;
4493bddcd29dSMark Adams   const PetscInt     *ic,*ai=a->i,*aj=a->j;
4494bddcd29dSMark Adams   PetscScalar        *ba_t;
4495bddcd29dSMark Adams   int                *bi_t;
4496bddcd29dSMark Adams   PetscInt           i,n=A->rmap->n,Nf;
4497bddcd29dSMark Adams   PetscInt           nzBcsr,bwL,bwU;
4498bddcd29dSMark Adams   PetscBool          missing;
4499bddcd29dSMark Adams   Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)B->spptr;
4500bddcd29dSMark Adams   PetscContainer               container;
4501bddcd29dSMark Adams 
4502bddcd29dSMark Adams   PetscFunctionBegin;
4503bddcd29dSMark Adams   if (A->rmap->N != A->cmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"matrix must be square");
4504bddcd29dSMark Adams   ierr = MatMissingDiagonal(A,&missing,&i);CHKERRQ(ierr);
4505bddcd29dSMark Adams   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",i);
4506bddcd29dSMark Adams   if (!cusparseTriFactors) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"!cusparseTriFactors");
4507bddcd29dSMark Adams   ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&missing);CHKERRQ(ierr);
4508bddcd29dSMark Adams   if (!missing) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"only structrally symmetric matrices supported");
4509bddcd29dSMark Adams 
4510bddcd29dSMark Adams    // factor: get Nf if available
4511bddcd29dSMark Adams   ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
4512bddcd29dSMark Adams   if (container) {
4513bddcd29dSMark Adams     PetscInt *pNf=NULL;
4514bddcd29dSMark Adams     ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
4515bddcd29dSMark Adams     Nf = (*pNf)%1000;
4516bddcd29dSMark Adams     ierr = PetscContainerCreate(PETSC_COMM_SELF, &container);CHKERRQ(ierr);
4517bddcd29dSMark Adams     ierr = PetscMalloc(sizeof(PetscInt), &pNf);CHKERRQ(ierr);
4518bddcd29dSMark Adams     *pNf = Nf;
4519bddcd29dSMark Adams     ierr = PetscContainerSetPointer(container, (void *)pNf);CHKERRQ(ierr);
4520bddcd29dSMark Adams     ierr = PetscContainerSetUserDestroy(container, MatrixNfDestroy);CHKERRQ(ierr);
4521bddcd29dSMark Adams     ierr = PetscObjectCompose((PetscObject)B, "Nf", (PetscObject) container);CHKERRQ(ierr);
4522bddcd29dSMark Adams     ierr = PetscContainerDestroy(&container);CHKERRQ(ierr);
4523bddcd29dSMark Adams   } else Nf = 1;
4524bddcd29dSMark Adams   if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);
4525bddcd29dSMark Adams 
4526bddcd29dSMark Adams   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
4527bddcd29dSMark Adams   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
4528bddcd29dSMark Adams 
4529bddcd29dSMark Adams   ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
4530bddcd29dSMark Adams   ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)isicol);CHKERRQ(ierr);
4531bddcd29dSMark Adams   b    = (Mat_SeqAIJ*)(B)->data;
4532bddcd29dSMark Adams 
4533bddcd29dSMark Adams   /* get band widths, MatComputeBandwidth should take a reordering ic and do this */
4534bddcd29dSMark Adams   bwL = bwU = 0;
4535bddcd29dSMark Adams   for (int rwb=0; rwb<n; rwb++) {
4536bddcd29dSMark Adams     const PetscInt rwa = ic[rwb], anz = ai[rwb+1] - ai[rwb], *ajtmp = aj + ai[rwb];
4537bddcd29dSMark Adams     for (int j=0;j<anz;j++) {
4538bddcd29dSMark Adams       PetscInt colb = ic[ajtmp[j]];
4539bddcd29dSMark Adams       if (colb<rwa) { // L
4540bddcd29dSMark Adams         if (rwa-colb > bwL) bwL = rwa-colb;
4541bddcd29dSMark Adams       } else {
4542bddcd29dSMark Adams         if (colb-rwa > bwU) bwU = colb-rwa;
4543bddcd29dSMark Adams       }
4544bddcd29dSMark Adams     }
4545bddcd29dSMark Adams   }
4546bddcd29dSMark Adams   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
4547bddcd29dSMark Adams   /* only support structurally symmetric, but it might work */
4548bddcd29dSMark Adams   if (bwL!=bwU) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Only symmetric structure supported (now) W_L=%D W_U=%D",bwL,bwU);
4549bddcd29dSMark Adams   ierr = MatSeqAIJCUSPARSETriFactors_Reset(&cusparseTriFactors);CHKERRQ(ierr);
4550bddcd29dSMark Adams   nzBcsr = n + (2*n-1)*bwU - bwU*bwU;
4551bddcd29dSMark Adams   b->maxnz = b->nz = nzBcsr;
4552bddcd29dSMark Adams   cusparseTriFactors->nnz = b->nz; // only meta data needed: n & nz
4553bddcd29dSMark Adams   if (!cusparseTriFactors->workVector) { cusparseTriFactors->workVector = new THRUSTARRAY(n); }
4554bddcd29dSMark Adams   cerr = cudaMalloc(&ba_t,(b->nz+1)*sizeof(PetscScalar));CHKERRCUDA(cerr); // incude a place for flops
4555bddcd29dSMark Adams   cerr = cudaMalloc(&bi_t,(n+1)*sizeof(int));CHKERRCUDA(cerr);
4556bddcd29dSMark Adams   cusparseTriFactors->a_band_d = ba_t;
4557bddcd29dSMark Adams   cusparseTriFactors->i_band_d = bi_t;
4558bddcd29dSMark Adams   /* In b structure:  Free imax, ilen, old a, old j.  Allocate solve_work, new a, new j */
4559bddcd29dSMark Adams   ierr = PetscLogObjectMemory((PetscObject)B,(nzBcsr+1)*(sizeof(PetscInt)+sizeof(PetscScalar)));CHKERRQ(ierr);
4560bddcd29dSMark Adams   {
4561bddcd29dSMark Adams     dim3 dimBlockTeam(1,128);
4562bddcd29dSMark Adams     dim3 dimBlockLeague(Nf,1);
4563bddcd29dSMark Adams     mat_lu_factor_band_init_set_i<<<dimBlockLeague,dimBlockTeam>>>(n, bwU, bi_t);
4564bddcd29dSMark Adams   }
4565bddcd29dSMark Adams   CHECK_LAUNCH_ERROR(); // does a sync
4566bddcd29dSMark Adams 
4567bddcd29dSMark Adams   // setup data
4568bddcd29dSMark Adams   if (!cusparseTriFactors->rpermIndices) {
4569bddcd29dSMark Adams     const PetscInt *r;
4570bddcd29dSMark Adams 
4571bddcd29dSMark Adams     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
4572bddcd29dSMark Adams     cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n);
4573bddcd29dSMark Adams     cusparseTriFactors->rpermIndices->assign(r, r+n);
4574bddcd29dSMark Adams     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
4575bddcd29dSMark Adams     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
4576bddcd29dSMark Adams   }
4577bddcd29dSMark Adams   /* upper triangular indices */
4578bddcd29dSMark Adams   if (!cusparseTriFactors->cpermIndices) {
4579bddcd29dSMark Adams     const PetscInt *c;
4580bddcd29dSMark Adams 
4581bddcd29dSMark Adams     ierr = ISGetIndices(isicol,&c);CHKERRQ(ierr);
4582bddcd29dSMark Adams     cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n);
4583bddcd29dSMark Adams     cusparseTriFactors->cpermIndices->assign(c, c+n);
4584bddcd29dSMark Adams     ierr = ISRestoreIndices(isicol,&c);CHKERRQ(ierr);
4585bddcd29dSMark Adams     ierr = PetscLogCpuToGpu(n*sizeof(PetscInt));CHKERRQ(ierr);
4586bddcd29dSMark Adams   }
4587bddcd29dSMark Adams 
4588bddcd29dSMark Adams   /* put together the new matrix */
4589bddcd29dSMark Adams   b->free_a       = PETSC_FALSE;
4590bddcd29dSMark Adams   b->free_ij      = PETSC_FALSE;
4591bddcd29dSMark Adams   b->singlemalloc = PETSC_FALSE;
4592bddcd29dSMark Adams   b->ilen = NULL;
4593bddcd29dSMark Adams   b->imax = NULL;
4594bddcd29dSMark Adams   b->row  = isrow;
4595bddcd29dSMark Adams   b->col  = iscol;
4596bddcd29dSMark Adams   ierr    = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
4597bddcd29dSMark Adams   ierr    = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
4598bddcd29dSMark Adams   b->icol = isicol;
4599bddcd29dSMark Adams   ierr    = PetscMalloc1(n+1,&b->solve_work);CHKERRQ(ierr);
4600bddcd29dSMark Adams 
4601bddcd29dSMark Adams   B->factortype            = MAT_FACTOR_LU;
4602bddcd29dSMark Adams   B->info.factor_mallocs   = 0;
4603bddcd29dSMark Adams   B->info.fill_ratio_given = 0;
4604bddcd29dSMark Adams 
4605bddcd29dSMark Adams   if (ai[n]) {
4606bddcd29dSMark Adams     B->info.fill_ratio_needed = ((PetscReal)(nzBcsr))/((PetscReal)ai[n]);
4607bddcd29dSMark Adams   } else {
4608bddcd29dSMark Adams     B->info.fill_ratio_needed = 0.0;
4609bddcd29dSMark Adams   }
4610bddcd29dSMark Adams #if defined(PETSC_USE_INFO)
4611bddcd29dSMark Adams   if (ai[n] != 0) {
4612bddcd29dSMark Adams     PetscReal af = B->info.fill_ratio_needed;
4613bddcd29dSMark Adams     ierr = PetscInfo1(A,"Band fill ratio %g\n",(double)af);CHKERRQ(ierr);
4614bddcd29dSMark Adams   } else {
4615bddcd29dSMark Adams     ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr);
4616bddcd29dSMark Adams   }
4617bddcd29dSMark Adams #endif
4618bddcd29dSMark Adams   if (a->inode.size) {
4619bddcd29dSMark Adams     ierr = PetscInfo(A,"Warning: using inodes in band solver.\n");CHKERRQ(ierr);
4620bddcd29dSMark Adams   }
4621bddcd29dSMark Adams   ierr = MatSeqAIJCheckInode_FactorLU(B);CHKERRQ(ierr);
4622bddcd29dSMark Adams   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSEBAND;
4623bddcd29dSMark Adams   B->offloadmask = PETSC_OFFLOAD_GPU;
4624bddcd29dSMark Adams 
4625bddcd29dSMark Adams   PetscFunctionReturn(0);
4626bddcd29dSMark Adams }
4627bddcd29dSMark Adams 
/*
  MatFactorGetSolverType_seqaij_cusparse_band - Reports the solver package of a band factor matrix.

  Input Parameter:
. A - the factor matrix (not inspected here)

  Output Parameter:
. type - set to MATSOLVERCUSPARSEBAND

  Notes:
  Select this solver at run time with -pc_factor_mat_solver_type cusparseband
*/
PetscErrorCode MatFactorGetSolverType_seqaij_cusparse_band(Mat A,MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSEBAND;
  PetscFunctionReturn(0);
}
4635bddcd29dSMark Adams 
/*
  MatGetFactor_seqaijcusparse_cusparse_band - Creates the factor matrix used by the CUSPARSEBAND solver.

  Input Parameters:
+ A     - matrix to be factored; provides the communicator, A->rmap->n and the block sizes
- ftype - requested factorization, only MAT_FACTOR_LU is accepted

  Output Parameter:
. B - new n-by-n MATSEQAIJCUSPARSE matrix (n = A->rmap->n) created with MAT_SKIP_ALLOCATION

  Notes:
  Any ftype other than MAT_FACTOR_LU raises PETSC_ERR_SUP. The matrix answers the
  "MatFactorGetSolverType_C" query through MatFactorGetSolverType_seqaij_cusparse_band().
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse_band(Mat A,MatFactorType ftype,Mat *B)
{
  const PetscInt nrows = A->rmap->n;
  PetscErrorCode ierr;
  Mat            F;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  F    = *B; /* shorthand for the matrix just created */
  ierr = MatSetSizes(F,nrows,nrows,nrows,nrows);CHKERRQ(ierr);
  F->factortype     = ftype;
  F->canuseordering = PETSC_TRUE;
  ierr = MatSetType(F,MATSEQAIJCUSPARSE);CHKERRQ(ierr);

  /* the band solver only installs LU symbolic factorization routines */
  if (ftype != MAT_FACTOR_LU) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for CUSPARSEBAND Matrix Types");
  ierr = MatSetBlockSizesFromMats(F,A,A);CHKERRQ(ierr);
  F->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
  F->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSEBAND;

  ierr = MatSeqAIJSetPreallocation(F,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)F,"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_cusparse_band);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4658bddcd29dSMark Adams 
#define WARP_SIZE 32
/*
  wreduce - sums one value per lane over a warp using shuffle-down steps of 16,8,4,2,1.

  The shuffles use the full mask 0xffffffff, so all 32 lanes of the warp must call this
  together. Only lane 0 is guaranteed to end up with the sum over all lanes.
*/
template <typename T>
__forceinline__ __device__
T wreduce(T a)
{
  #pragma unroll
  for (int offset = WARP_SIZE >> 1; offset > 0; offset >>= 1) {
    a += __shfl_down_sync(0xffffffff, a, offset);
  }
  return a;
}
/*
  breduce - sums one value per thread over a thread block; the result is returned in thread 0.

  Template Parameters:
+ T          - arithmetic type accepted by __shfl_down_sync()
- BLOCK_SIZE - number of threads in the block (blockDim.x); a multiple of WARP_SIZE, at most
               WARP_SIZE*WARP_SIZE

  Every thread of the block must call this (it contains block-wide barriers). Threads outside
  warp 0 get their own input back; lanes 1..31 of warp 0 get partial sums. The shared scratch
  is released by a barrier before returning, so the function may be called back to back.
*/
template <typename T, int BLOCK_SIZE>
__device__
T breduce(T a)
{
  constexpr int NWARP = BLOCK_SIZE/WARP_SIZE;
  static_assert(BLOCK_SIZE >= WARP_SIZE && BLOCK_SIZE%WARP_SIZE == 0 && NWARP <= WARP_SIZE,"breduce: BLOCK_SIZE must be a multiple of WARP_SIZE with at most WARP_SIZE warps");
  __shared__ T buf[NWARP]; /* one partial sum per warp, stored in the reduction type */
  const int wid    = threadIdx.x / WARP_SIZE;
  const int laneid = threadIdx.x % WARP_SIZE;
  const T   wsum   = wreduce<T>(a);
  T         part   = 0;

  if (laneid == 0) buf[wid] = wsum;
  __syncthreads();
  /* only the first NWARP lanes of warp 0 pick up a partial sum, everything else contributes 0 */
  if (threadIdx.x < NWARP) part = buf[threadIdx.x];
  /* buf has been consumed; without this barrier a fast warp entering the next breduce() call
     could overwrite its slot before warp 0 has read it */
  __syncthreads();
  if (wid == 0) {
    /* full-warp reduction: correct for any NWARP <= WARP_SIZE, not only powers of two */
    a = wreduce<T>(part);
  }
  return a;
}
4696bddcd29dSMark Adams 
4697bddcd29dSMark Adams 
/*
  mat_solve_band - in-place forward (L) then backward (U) substitution with the band LU factors.

  Launch layout: one thread block per field (Nf = gridDim.x); blockDim.x is expected to equal
  BLOCK_SIZE, the size breduce<> is instantiated with. Block 'field' works on the contiguous
  rows [field*(n/Nf), (field+1)*(n/Nf)) and only reads and writes x inside that range.

  Input Parameters:
+ n      - number of rows of the factored matrix
. bw     - bandwidth
- ba_csr - values of the band factors; the offsets used below follow the original comments
           (rows truncated at the top-left and bottom-right corners, padded at field boundaries)

  Input/Output Parameter:
. x - right-hand side on entry, solution on exit

  L is applied with a unit diagonal (no division); U divides by its stored diagonal entry.
*/
template <int BLOCK_SIZE>
__global__
void __launch_bounds__(256,1)
mat_solve_band(const PetscInt n, const PetscInt bw, const PetscScalar ba_csr[], PetscScalar x[])
{
  const PetscInt    Nf        = gridDim.x;
  const PetscInt    nloc      = n/Nf;
  const PetscInt    field     = blockIdx.x;
  const PetscInt    start     = field*nloc;
  const PetscInt    end       = start + nloc;
  const PetscInt    chopnz    = bw*(bw+1)/2;
  const PetscInt    blocknz   = (2*bw+1)*nloc;
  const PetscInt    blocknz_0 = blocknz - chopnz;
  const int         tid       = threadIdx.x;
  const PetscScalar *row;

  /* ---- forward sweep with L ---- */
  row = ba_csr; // diagonal (0,0) of this field
  if (field != 0) row += blocknz_0 + (field-1)*blocknz + bw;
  for (int glbDD=start, locDD=0; glbDD<end; glbDD++, locDD++) {
    PetscInt    first; // first column of this row that belongs to the field
    PetscScalar acc = 0;

    if (locDD < bw) first = start;
    else            first = glbDD - bw;
    // each thread accumulates a strided share of the row
    for (int j=first+tid, k=tid; j<glbDD; j+=blockDim.x, k+=blockDim.x) {
      acc += row[k]*x[j];
    }
#if defined(PETSC_USE_COMPLEX)
    PetscReal accr = PetscRealPartComplex(acc), acci = PetscImaginaryPartComplex(acc);
    PetscScalar total(breduce<PetscReal,BLOCK_SIZE>(accr), breduce<PetscReal,BLOCK_SIZE>(acci));
    acc = total;
#else
    acc = breduce<PetscReal,BLOCK_SIZE>(acc);
#endif
    if (tid == 0) x[glbDD] -= acc; // unit diagonal, nothing to divide by
    __syncthreads();
    {
      // move to the start of the next row's L part
      PetscInt advance = (glbDD - first) + 1;               // up to the diagonal, then one past it
      if (glbDD > n-1-bw) advance += n-1-glbDD;             // skip over U, shortened in the last rows
      else                advance += bw;
      if (field>0 && (locDD+1)<bw) advance += bw-(locDD+1); // skip padding at beginning (ear)
      row += advance;
    }
  }

  /* ---- backward sweep with U ---- */
  row = ba_csr + Nf*blocknz - 2*chopnz - 1; // end of real data (last diagonal)
  if (field != Nf-1) row -= blocknz_0 + (Nf-2-field)*blocknz + bw; // diagonal of last local row
  for (int glbDD=end-1, locDD=0; glbDD>=start; glbDD--, locDD++) {
    PetscInt    last; // end of row in U
    PetscScalar acc = 0;

    if (locDD < bw) last = end-1;
    else            last = glbDD + bw;
    // walk the row right to left, again strided over the threads
    for (int j=last-tid, k=tid; j>glbDD; j-=blockDim.x, k+=blockDim.x) {
      acc += row[-k]*x[j];
    }
#if defined(PETSC_USE_COMPLEX)
    PetscReal accr = PetscRealPartComplex(acc), acci = PetscImaginaryPartComplex(acc);
    PetscScalar total(breduce<PetscReal,BLOCK_SIZE>(accr), breduce<PetscReal,BLOCK_SIZE>(acci));
    acc = total;
#else
    acc = breduce<PetscReal,BLOCK_SIZE>(PetscRealPart(acc));
#endif
    row -= last - glbDD; // now on the diagonal
    if (tid == 0) {
      x[glbDD] -= acc;
      x[glbDD] /= row[0];
    }
    __syncthreads();
    // step back past L to the end of the previous row's U part
    row -= bw+1;
    if (glbDD < bw) row += bw-glbDD; // overshot in top left corner
    if (((locDD+1) < bw) && field != Nf-1) row -= (bw - (locDD+1)); // skip past right corner
  }
}
4761bddcd29dSMark Adams 
/*
  MatSolve_SeqAIJCUSPARSEBAND - Solves A x = b on the GPU with the band LU factors stored in A.

  Input Parameters:
+ A  - factored matrix; A->spptr holds the Mat_SeqAIJCUSPARSETriFactors with the band values,
       the row/column permutations and the work vector
- bb - right-hand side

  Output Parameter:
. xx - solution

  Notes:
  The right-hand side is permuted with rpermIndices into the work vector, mat_solve_band() does
  the L and U substitutions in place with one thread block per field, and the result is permuted
  with cpermIndices into xx. The number of fields is read from an optional "Nf" container
  composed on A (value taken modulo 1000), defaulting to 1; it must be positive and divide n.
  The bandwidth is recovered from n and the stored number of nonzeros.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSEBAND(Mat A,Vec bb,Vec xx)
{
  const PetscScalar                     *barray;
  PetscScalar                           *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors          *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors*)A->spptr;
  THRUSTARRAY                           *tempGPU = (THRUSTARRAY*)cusparseTriFactors->workVector;
  PetscInt                              n=A->rmap->n, nz=cusparseTriFactors->nnz, bw, Nf;
  PetscInt64                            disc;
  PetscErrorCode                        ierr;
  cudaError_t                           cerr;
  PetscContainer                        container;

  PetscFunctionBegin;
  if (n == 0) {
    PetscFunctionReturn(0);
  }
  /* nz = n*(2*bw+1) - bw*(bw+1)  =>  bw = ((2n-1) - sqrt(1+4*(n*n-nz)))/2.
     The discriminant is formed in 64-bit integers: n*n overflows a 32-bit PetscInt for modest n */
  disc = 1 + 4*((PetscInt64)n*(PetscInt64)n - (PetscInt64)nz);
  bw   = (2*n-1 - (PetscInt)(PetscSqrtReal((PetscReal)disc)+PETSC_MACHINE_EPSILON))/2;

  // factor: get Nf if available
  ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr);
  if (container) {
    PetscInt *pNf=NULL;
    ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr);
    Nf = (*pNf)%1000;
  } else Nf = 1;
  /* Nf is used as a divisor and as the grid size, so it must be positive */
  if (Nf <= 0) SETERRQ1(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_OUTOFRANGE,"Number of fields must be positive, got %D",Nf);
  if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf);

  /* Get the GPU pointers */
  ierr = VecCUDAGetArrayWrite(xx,&xarray);CHKERRQ(ierr);
  ierr = VecCUDAGetArrayRead(bb,&barray);CHKERRQ(ierr);
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* First, reorder with the row permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()),
               thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()),
               tempGPU->begin());
  /* Next, L and U solves in place on the work vector; the block size must match the template argument */
  constexpr int block = 128;
  mat_solve_band<block><<<Nf,block>>>(n,bw,cusparseTriFactors->a_band_d,tempGPU->data().get());
  CHECK_LAUNCH_ERROR(); // does a sync

  /* Last, reorder with the column permutation */
  thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream),thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()),
               thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()),
               xGPU);

  ierr = VecCUDARestoreArrayRead(bb,&barray);CHKERRQ(ierr);
  ierr = VecCUDARestoreArrayWrite(xx,&xarray);CHKERRQ(ierr);
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  ierr = PetscLogGpuFlops(2.0*cusparseTriFactors->nnz - A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4815