1 /* 2 Defines the basic matrix operations for the AIJ (compressed row) 3 matrix storage format using the CUSPARSE library, 4 */ 5 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 6 7 #include <petscconf.h> 8 #include <../src/mat/impls/aij/seq/aij.h> /*I "petscmat.h" I*/ 9 #include <../src/mat/impls/sbaij/seq/sbaij.h> 10 #include <../src/vec/vec/impls/dvecimpl.h> 11 #include <petsc/private/vecimpl.h> 12 #undef VecType 13 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 14 #include <thrust/adjacent_difference.h> 15 #if PETSC_CPP_VERSION >= 14 16 #define PETSC_HAVE_THRUST_ASYNC 1 17 // thrust::for_each(thrust::cuda::par.on()) requires C++14 18 #endif 19 #include <thrust/iterator/constant_iterator.h> 20 #include <thrust/remove.h> 21 #include <thrust/sort.h> 22 #include <thrust/unique.h> 23 #if PETSC_PKG_CUDA_VERSION_GE(12, 9, 0) && !PetscDefined(HAVE_THRUST) 24 #include <cuda/std/functional> 25 #endif 26 27 const char *const MatCUSPARSEStorageFormats[] = {"CSR", "ELL", "HYB", "MatCUSPARSEStorageFormat", "MAT_CUSPARSE_", 0}; 28 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 29 /* 30 The following are copied from cusparse.h in CUDA-11.0. In MatCUSPARSESpMVAlgorithms[] etc, we copy them in 31 0-based integer value order, since we want to use PetscOptionsEnum() to parse user command line options for them. 32 */ 33 const char *const MatCUSPARSESpMVAlgorithms[] = {"MV_ALG_DEFAULT", "COOMV_ALG", "CSRMV_ALG1", "CSRMV_ALG2", "cusparseSpMVAlg_t", "CUSPARSE_", 0}; 34 const char *const MatCUSPARSESpMMAlgorithms[] = {"ALG_DEFAULT", "COO_ALG1", "COO_ALG2", "COO_ALG3", "CSR_ALG1", "COO_ALG4", "CSR_ALG2", "cusparseSpMMAlg_t", "CUSPARSE_SPMM_", 0}; 35 const char *const MatCUSPARSECsr2CscAlgorithms[] = {"INVALID" /*cusparse does not have enum 0! 
We created one*/, "ALG1", "ALG2", "cusparseCsr2CscAlg_t", "CUSPARSE_CSR2CSC_", 0}; 36 #endif 37 38 static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 39 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, const MatFactorInfo *); 40 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat, Mat, const MatFactorInfo *); 41 static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat, Mat, IS, IS, const MatFactorInfo *); 42 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 43 static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat, Vec, Vec); 44 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 45 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 46 static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat, Vec, Vec); 47 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSETriFactorStruct **); 48 #endif 49 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat, PetscOptionItems PetscOptionsObject); 50 static PetscErrorCode MatAXPY_SeqAIJCUSPARSE(Mat, PetscScalar, Mat, MatStructure); 51 static PetscErrorCode MatScale_SeqAIJCUSPARSE(Mat, PetscScalar); 52 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat, Vec, Vec); 53 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 54 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 55 static PetscErrorCode MatMultTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 56 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat, Vec, Vec); 57 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec); 58 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat, Vec, Vec, Vec, PetscBool, PetscBool); 59 60 static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **); 61 static PetscErrorCode MatSeqAIJCUSPARSEMultStruct_Destroy(Mat_SeqAIJCUSPARSEMultStruct **, MatCUSPARSEStorageFormat); 62 static PetscErrorCode 
MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **); 63 static PetscErrorCode MatSeqAIJCUSPARSE_Destroy(Mat); 64 65 static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat); 66 static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat, PetscBool); 67 68 static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat, PetscInt, const PetscInt[], PetscScalar[]); 69 static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat, PetscCount, PetscInt[], PetscInt[]); 70 static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat, const PetscScalar[], InsertMode); 71 72 PETSC_INTERN PetscErrorCode MatCUSPARSESetFormat_SeqAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) 73 { 74 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 75 76 PetscFunctionBegin; 77 switch (op) { 78 case MAT_CUSPARSE_MULT: 79 cusparsestruct->format = format; 80 break; 81 case MAT_CUSPARSE_ALL: 82 cusparsestruct->format = format; 83 break; 84 default: 85 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. MAT_CUSPARSE_MULT and MAT_CUSPARSE_ALL are currently supported.", op); 86 } 87 PetscFunctionReturn(PETSC_SUCCESS); 88 } 89 90 /*@ 91 MatCUSPARSESetFormat - Sets the storage format of `MATSEQCUSPARSE` matrices for a particular 92 operation. Only the `MatMult()` operation can use different GPU storage formats 93 94 Not Collective 95 96 Input Parameters: 97 + A - Matrix of type `MATSEQAIJCUSPARSE` 98 . op - `MatCUSPARSEFormatOperation`. `MATSEQAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT` and `MAT_CUSPARSE_ALL`. 99 `MATMPIAIJCUSPARSE` matrices support `MAT_CUSPARSE_MULT_DIAG`,`MAT_CUSPARSE_MULT_OFFDIAG`, and `MAT_CUSPARSE_ALL`. 100 - format - `MatCUSPARSEStorageFormat` (one of `MAT_CUSPARSE_CSR`, `MAT_CUSPARSE_ELL`, `MAT_CUSPARSE_HYB`.) 
101 102 Level: intermediate 103 104 .seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 105 @*/ 106 PetscErrorCode MatCUSPARSESetFormat(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) 107 { 108 PetscFunctionBegin; 109 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 110 PetscTryMethod(A, "MatCUSPARSESetFormat_C", (Mat, MatCUSPARSEFormatOperation, MatCUSPARSEStorageFormat), (A, op, format)); 111 PetscFunctionReturn(PETSC_SUCCESS); 112 } 113 114 PETSC_INTERN PetscErrorCode MatCUSPARSESetUseCPUSolve_SeqAIJCUSPARSE(Mat A, PetscBool use_cpu) 115 { 116 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 117 118 PetscFunctionBegin; 119 cusparsestruct->use_cpu_solve = use_cpu; 120 PetscFunctionReturn(PETSC_SUCCESS); 121 } 122 123 /*@ 124 MatCUSPARSESetUseCPUSolve - Sets to use CPU `MatSolve()`. 125 126 Input Parameters: 127 + A - Matrix of type `MATSEQAIJCUSPARSE` 128 - use_cpu - set flag for using the built-in CPU `MatSolve()` 129 130 Level: intermediate 131 132 Note: 133 The NVIDIA cuSPARSE LU solver currently computes the factors with the built-in CPU method 134 and moves the factors to the GPU for the solve. We have observed better performance keeping the data on the CPU and performing the solve there. 135 This method to specify if the solve is done on the CPU or GPU (GPU is the default). 
136 137 .seealso: [](ch_matrices), `Mat`, `MatSolve()`, `MATSEQAIJCUSPARSE`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 138 @*/ 139 PetscErrorCode MatCUSPARSESetUseCPUSolve(Mat A, PetscBool use_cpu) 140 { 141 PetscFunctionBegin; 142 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 143 PetscTryMethod(A, "MatCUSPARSESetUseCPUSolve_C", (Mat, PetscBool), (A, use_cpu)); 144 PetscFunctionReturn(PETSC_SUCCESS); 145 } 146 147 static PetscErrorCode MatSetOption_SeqAIJCUSPARSE(Mat A, MatOption op, PetscBool flg) 148 { 149 PetscFunctionBegin; 150 switch (op) { 151 case MAT_FORM_EXPLICIT_TRANSPOSE: 152 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 153 if (A->form_explicit_transpose && !flg) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 154 A->form_explicit_transpose = flg; 155 break; 156 default: 157 PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 158 break; 159 } 160 PetscFunctionReturn(PETSC_SUCCESS); 161 } 162 163 static PetscErrorCode MatSetFromOptions_SeqAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject) 164 { 165 MatCUSPARSEStorageFormat format; 166 PetscBool flg; 167 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 168 169 PetscFunctionBegin; 170 PetscOptionsHeadBegin(PetscOptionsObject, "SeqAIJCUSPARSE options"); 171 if (A->factortype == MAT_FACTOR_NONE) { 172 PetscCall(PetscOptionsEnum("-mat_cusparse_mult_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg)); 173 if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT, format)); 174 175 PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of (seq)aijcusparse gpu matrices for SpMV and TriSolve", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparsestruct->format, (PetscEnum *)&format, &flg)); 176 if (flg) 
PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format)); 177 PetscCall(PetscOptionsBool("-mat_cusparse_use_cpu_solve", "Use CPU (I)LU solve", "MatCUSPARSESetUseCPUSolve", cusparsestruct->use_cpu_solve, &cusparsestruct->use_cpu_solve, &flg)); 178 if (flg) PetscCall(MatCUSPARSESetUseCPUSolve(A, cusparsestruct->use_cpu_solve)); 179 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 180 PetscCall(PetscOptionsEnum("-mat_cusparse_spmv_alg", "sets cuSPARSE algorithm used in sparse-mat dense-vector multiplication (SpMV)", "cusparseSpMVAlg_t", MatCUSPARSESpMVAlgorithms, (PetscEnum)cusparsestruct->spmvAlg, (PetscEnum *)&cusparsestruct->spmvAlg, &flg)); 181 /* If user did use this option, check its consistency with cuSPARSE, since PetscOptionsEnum() sets enum values based on their position in MatCUSPARSESpMVAlgorithms[] */ 182 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 183 PetscCheck(!flg || CUSPARSE_SPMV_CSR_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 184 #else 185 PetscCheck(!flg || CUSPARSE_CSRMV_ALG1 == 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMVAlg_t has been changed but PETSc has not been updated accordingly"); 186 #endif 187 PetscCall(PetscOptionsEnum("-mat_cusparse_spmm_alg", "sets cuSPARSE algorithm used in sparse-mat dense-mat multiplication (SpMM)", "cusparseSpMMAlg_t", MatCUSPARSESpMMAlgorithms, (PetscEnum)cusparsestruct->spmmAlg, (PetscEnum *)&cusparsestruct->spmmAlg, &flg)); 188 PetscCheck(!flg || CUSPARSE_SPMM_CSR_ALG1 == 4, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseSpMMAlg_t has been changed but PETSc has not been updated accordingly"); 189 190 PetscCall( 191 PetscOptionsEnum("-mat_cusparse_csr2csc_alg", "sets cuSPARSE algorithm used in converting CSR matrices to CSC matrices", "cusparseCsr2CscAlg_t", MatCUSPARSECsr2CscAlgorithms, (PetscEnum)cusparsestruct->csr2cscAlg, (PetscEnum *)&cusparsestruct->csr2cscAlg, &flg)); 192 PetscCheck(!flg || 
CUSPARSE_CSR2CSC_ALG1 == 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE enum cusparseCsr2CscAlg_t has been changed but PETSc has not been updated accordingly"); 193 #endif 194 } 195 PetscOptionsHeadEnd(); 196 PetscFunctionReturn(PETSC_SUCCESS); 197 } 198 199 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 200 static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(Mat A) 201 { 202 Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data); 203 PetscInt m = A->rmap->n; 204 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 205 const PetscInt *Ai = a->i, *Aj = a->j, *Adiag = a->diag; 206 const MatScalar *Aa = a->a; 207 PetscInt *Mi, *Mj, Mnz; 208 PetscScalar *Ma; 209 210 PetscFunctionBegin; 211 if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU 212 if (!fs->csrRowPtr) { // Is't the first time to do the setup? Use csrRowPtr since it is not null even when m=0 213 // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host 214 Mnz = (Ai[m] - Ai[0]) + (Adiag[0] - Adiag[m]); // Lnz (without the unit diagonal) + Unz (with the non-unit diagonal) 215 PetscCall(PetscMalloc1(m + 1, &Mi)); 216 PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj is temp 217 PetscCall(PetscMalloc1(Mnz, &Ma)); 218 Mi[0] = 0; 219 for (PetscInt i = 0; i < m; i++) { 220 PetscInt llen = Ai[i + 1] - Ai[i]; 221 PetscInt ulen = Adiag[i] - Adiag[i + 1]; 222 PetscCall(PetscArraycpy(Mj + Mi[i], Aj + Ai[i], llen)); // entries of L 223 Mj[Mi[i] + llen] = i; // diagonal entry 224 PetscCall(PetscArraycpy(Mj + Mi[i] + llen + 1, Aj + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal 225 Mi[i + 1] = Mi[i] + llen + ulen; 226 } 227 // Copy M (L,U) from host to device 228 PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1))); 229 PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz)); 230 PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz)); 231 
PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Mi, sizeof(*fs->csrRowPtr) * (m + 1), cudaMemcpyHostToDevice)); 232 PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*fs->csrColIdx) * Mnz, cudaMemcpyHostToDevice)); 233 234 // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 235 // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 236 // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 237 // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 238 // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 239 cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_LOWER; 240 cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT; 241 const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I; 242 243 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 244 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 245 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 246 247 fillMode = CUSPARSE_FILL_MODE_UPPER; 248 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 249 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 250 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 251 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 252 253 // 
Allocate work vectors in SpSv 254 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m)); 255 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m)); 256 257 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 258 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 259 260 // Query buffer sizes for SpSV and then allocate buffers, temporarily assuming opA = CUSPARSE_OPERATION_NON_TRANSPOSE 261 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 262 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 263 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 264 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 265 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 266 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 267 268 // Record for reuse 269 fs->csrRowPtr_h = Mi; 270 fs->csrVal_h = Ma; 271 PetscCall(PetscFree(Mj)); 272 } 273 // Copy the value 274 Mi = fs->csrRowPtr_h; 275 Ma = fs->csrVal_h; 276 Mnz = Mi[m]; 277 for (PetscInt i = 0; i < m; i++) { 278 PetscInt llen = Ai[i + 1] - Ai[i]; 279 PetscInt ulen = Adiag[i] - Adiag[i + 1]; 280 PetscCall(PetscArraycpy(Ma + Mi[i], Aa + Ai[i], llen)); // entries of L 281 Ma[Mi[i] + llen] = (MatScalar)1.0 / Aa[Adiag[i]]; // recover the diagonal entry 282 PetscCall(PetscArraycpy(Ma + Mi[i] + llen + 1, Aa + Adiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal 283 } 284 PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, 
cudaMemcpyHostToDevice)); 285 286 #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1) 287 if (fs->updatedSpSVAnalysis) { // have done cusparseSpSV_analysis before, and only matrix values changed? 288 // Otherwise cusparse would error out: "On entry to cusparseSpSV_updateMatrix() parameter number 3 (newValues) had an illegal value: NULL pointer" 289 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 290 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 291 } else 292 #endif 293 { 294 // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values 295 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L)); 296 297 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U)); 298 fs->updatedSpSVAnalysis = PETSC_TRUE; 299 fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 300 } 301 } 302 PetscFunctionReturn(PETSC_SUCCESS); 303 } 304 #else 305 static PetscErrorCode MatSeqAIJCUSPARSEBuildILULowerTriMatrix(Mat A) 306 { 307 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 308 PetscInt n = A->rmap->n; 309 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 310 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 311 const PetscInt *ai = a->i, *aj = a->j, *vi; 312 const MatScalar *aa = a->a, *v; 313 PetscInt *AiLo, *AjLo; 314 PetscInt i, nz, nzLower, offset, rowOffset; 315 316 PetscFunctionBegin; 317 if (!n) 
PetscFunctionReturn(PETSC_SUCCESS); 318 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 319 try { 320 /* first figure out the number of nonzeros in the lower triangular matrix including 1's on the diagonal. */ 321 nzLower = n + ai[n] - ai[1]; 322 if (!loTriFactor) { 323 PetscScalar *AALo; 324 325 PetscCallCUDA(cudaMallocHost((void **)&AALo, nzLower * sizeof(PetscScalar))); 326 327 /* Allocate Space for the lower triangular matrix */ 328 PetscCallCUDA(cudaMallocHost((void **)&AiLo, (n + 1) * sizeof(PetscInt))); 329 PetscCallCUDA(cudaMallocHost((void **)&AjLo, nzLower * sizeof(PetscInt))); 330 331 /* Fill the lower triangular matrix */ 332 AiLo[0] = (PetscInt)0; 333 AiLo[n] = nzLower; 334 AjLo[0] = (PetscInt)0; 335 AALo[0] = (MatScalar)1.0; 336 v = aa; 337 vi = aj; 338 offset = 1; 339 rowOffset = 1; 340 for (i = 1; i < n; i++) { 341 nz = ai[i + 1] - ai[i]; 342 /* additional 1 for the term on the diagonal */ 343 AiLo[i] = rowOffset; 344 rowOffset += nz + 1; 345 346 PetscCall(PetscArraycpy(&AjLo[offset], vi, nz)); 347 PetscCall(PetscArraycpy(&AALo[offset], v, nz)); 348 349 offset += nz; 350 AjLo[offset] = (PetscInt)i; 351 AALo[offset] = (MatScalar)1.0; 352 offset += 1; 353 354 v += nz; 355 vi += nz; 356 } 357 358 /* allocate space for the triangular factor information */ 359 PetscCall(PetscNew(&loTriFactor)); 360 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 361 /* Create the matrix description */ 362 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 363 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 364 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 365 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 366 #else 367 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 368 #endif 369 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_LOWER)); 370 
PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 371 372 /* set the operation */ 373 loTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 374 375 /* set the matrix */ 376 loTriFactor->csrMat = new CsrMatrix; 377 loTriFactor->csrMat->num_rows = n; 378 loTriFactor->csrMat->num_cols = n; 379 loTriFactor->csrMat->num_entries = nzLower; 380 381 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1); 382 loTriFactor->csrMat->row_offsets->assign(AiLo, AiLo + n + 1); 383 384 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzLower); 385 loTriFactor->csrMat->column_indices->assign(AjLo, AjLo + nzLower); 386 387 loTriFactor->csrMat->values = new THRUSTARRAY(nzLower); 388 loTriFactor->csrMat->values->assign(AALo, AALo + nzLower); 389 390 /* Create the solve analysis information */ 391 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 392 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 393 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 394 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 395 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize)); 396 PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize)); 397 #endif 398 399 /* perform the solve analysis */ 400 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 401 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 402 PetscCallCUDA(WaitForCUDA()); 403 
PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 404 405 /* assign the pointer */ 406 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor; 407 loTriFactor->AA_h = AALo; 408 PetscCallCUDA(cudaFreeHost(AiLo)); 409 PetscCallCUDA(cudaFreeHost(AjLo)); 410 PetscCall(PetscLogCpuToGpu((n + 1 + nzLower) * sizeof(int) + nzLower * sizeof(PetscScalar))); 411 } else { /* update values only */ 412 if (!loTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&loTriFactor->AA_h, nzLower * sizeof(PetscScalar))); 413 /* Fill the lower triangular matrix */ 414 loTriFactor->AA_h[0] = 1.0; 415 v = aa; 416 vi = aj; 417 offset = 1; 418 for (i = 1; i < n; i++) { 419 nz = ai[i + 1] - ai[i]; 420 PetscCall(PetscArraycpy(&loTriFactor->AA_h[offset], v, nz)); 421 offset += nz; 422 loTriFactor->AA_h[offset] = 1.0; 423 offset += 1; 424 v += nz; 425 } 426 loTriFactor->csrMat->values->assign(loTriFactor->AA_h, loTriFactor->AA_h + nzLower); 427 PetscCall(PetscLogCpuToGpu(nzLower * sizeof(PetscScalar))); 428 } 429 } catch (char *ex) { 430 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 431 } 432 } 433 PetscFunctionReturn(PETSC_SUCCESS); 434 } 435 436 static PetscErrorCode MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(Mat A) 437 { 438 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 439 PetscInt n = A->rmap->n; 440 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 441 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 442 const PetscInt *aj = a->j, *adiag = a->diag, *vi; 443 const MatScalar *aa = a->a, *v; 444 PetscInt *AiUp, *AjUp; 445 PetscInt i, nz, nzUpper, offset; 446 447 PetscFunctionBegin; 448 if (!n) PetscFunctionReturn(PETSC_SUCCESS); 449 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 450 try { 451 /* next, figure out the number of nonzeros in the upper triangular matrix. 
*/ 452 nzUpper = adiag[0] - adiag[n]; 453 if (!upTriFactor) { 454 PetscScalar *AAUp; 455 456 PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar))); 457 458 /* Allocate Space for the upper triangular matrix */ 459 PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt))); 460 PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt))); 461 462 /* Fill the upper triangular matrix */ 463 AiUp[0] = (PetscInt)0; 464 AiUp[n] = nzUpper; 465 offset = nzUpper; 466 for (i = n - 1; i >= 0; i--) { 467 v = aa + adiag[i + 1] + 1; 468 vi = aj + adiag[i + 1] + 1; 469 470 /* number of elements NOT on the diagonal */ 471 nz = adiag[i] - adiag[i + 1] - 1; 472 473 /* decrement the offset */ 474 offset -= (nz + 1); 475 476 /* first, set the diagonal elements */ 477 AjUp[offset] = (PetscInt)i; 478 AAUp[offset] = (MatScalar)1. / v[nz]; 479 AiUp[i] = AiUp[i + 1] - (nz + 1); 480 481 PetscCall(PetscArraycpy(&AjUp[offset + 1], vi, nz)); 482 PetscCall(PetscArraycpy(&AAUp[offset + 1], v, nz)); 483 } 484 485 /* allocate space for the triangular factor information */ 486 PetscCall(PetscNew(&upTriFactor)); 487 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 488 489 /* Create the matrix description */ 490 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 491 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 492 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 493 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 494 #else 495 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 496 #endif 497 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 498 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 499 500 /* set the operation */ 501 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 502 503 /* set the matrix */ 504 
upTriFactor->csrMat = new CsrMatrix; 505 upTriFactor->csrMat->num_rows = n; 506 upTriFactor->csrMat->num_cols = n; 507 upTriFactor->csrMat->num_entries = nzUpper; 508 509 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(n + 1); 510 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + n + 1); 511 512 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(nzUpper); 513 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + nzUpper); 514 515 upTriFactor->csrMat->values = new THRUSTARRAY(nzUpper); 516 upTriFactor->csrMat->values->assign(AAUp, AAUp + nzUpper); 517 518 /* Create the solve analysis information */ 519 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 520 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 521 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 522 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 523 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize)); 524 PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize)); 525 #endif 526 527 /* perform the solve analysis */ 528 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 529 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 530 531 PetscCallCUDA(WaitForCUDA()); 532 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 533 534 /* assign the pointer */ 535 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor; 536 upTriFactor->AA_h = AAUp; 
537 PetscCallCUDA(cudaFreeHost(AiUp)); 538 PetscCallCUDA(cudaFreeHost(AjUp)); 539 PetscCall(PetscLogCpuToGpu((n + 1 + nzUpper) * sizeof(int) + nzUpper * sizeof(PetscScalar))); 540 } else { 541 if (!upTriFactor->AA_h) PetscCallCUDA(cudaMallocHost((void **)&upTriFactor->AA_h, nzUpper * sizeof(PetscScalar))); 542 /* Fill the upper triangular matrix */ 543 offset = nzUpper; 544 for (i = n - 1; i >= 0; i--) { 545 v = aa + adiag[i + 1] + 1; 546 547 /* number of elements NOT on the diagonal */ 548 nz = adiag[i] - adiag[i + 1] - 1; 549 550 /* decrement the offset */ 551 offset -= (nz + 1); 552 553 /* first, set the diagonal elements */ 554 upTriFactor->AA_h[offset] = 1. / v[nz]; 555 PetscCall(PetscArraycpy(&upTriFactor->AA_h[offset + 1], v, nz)); 556 } 557 upTriFactor->csrMat->values->assign(upTriFactor->AA_h, upTriFactor->AA_h + nzUpper); 558 PetscCall(PetscLogCpuToGpu(nzUpper * sizeof(PetscScalar))); 559 } 560 } catch (char *ex) { 561 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 562 } 563 } 564 PetscFunctionReturn(PETSC_SUCCESS); 565 } 566 #endif 567 568 static PetscErrorCode MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(Mat A) 569 { 570 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 571 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 572 IS isrow = a->row, isicol = a->icol; 573 PetscBool row_identity, col_identity; 574 PetscInt n = A->rmap->n; 575 576 PetscFunctionBegin; 577 PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 578 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 579 PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_LU(A)); 580 #else 581 PetscCall(MatSeqAIJCUSPARSEBuildILULowerTriMatrix(A)); 582 PetscCall(MatSeqAIJCUSPARSEBuildILUUpperTriMatrix(A)); 583 if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n); 584 #endif 585 586 cusparseTriFactors->nnz = a->nz; 587 588 A->offloadmask = PETSC_OFFLOAD_BOTH; // factored matrix is sync'ed to GPU 
589 /* lower triangular indices */ 590 PetscCall(ISIdentity(isrow, &row_identity)); 591 if (!row_identity && !cusparseTriFactors->rpermIndices) { 592 const PetscInt *r; 593 594 PetscCall(ISGetIndices(isrow, &r)); 595 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 596 cusparseTriFactors->rpermIndices->assign(r, r + n); 597 PetscCall(ISRestoreIndices(isrow, &r)); 598 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 599 } 600 601 /* upper triangular indices */ 602 PetscCall(ISIdentity(isicol, &col_identity)); 603 if (!col_identity && !cusparseTriFactors->cpermIndices) { 604 const PetscInt *c; 605 606 PetscCall(ISGetIndices(isicol, &c)); 607 cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 608 cusparseTriFactors->cpermIndices->assign(c, c + n); 609 PetscCall(ISRestoreIndices(isicol, &c)); 610 PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 611 } 612 PetscFunctionReturn(PETSC_SUCCESS); 613 } 614 615 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 616 static PetscErrorCode MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(Mat A) 617 { 618 Mat_SeqAIJ *a = static_cast<Mat_SeqAIJ *>(A->data); 619 PetscInt m = A->rmap->n; 620 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 621 const PetscInt *Ai = a->i, *Aj = a->j, *Adiag = a->diag; 622 const MatScalar *Aa = a->a; 623 PetscInt *Mj, Mnz; 624 PetscScalar *Ma, *D; 625 626 PetscFunctionBegin; 627 if (A->offloadmask == PETSC_OFFLOAD_CPU) { // A's latest factors are on CPU 628 if (!fs->csrRowPtr) { // Is't the first time to do the setup? Use csrRowPtr since it is not null even m=0 629 // Re-arrange the (skewed) factored matrix and put the result into M, a regular csr matrix on host. 630 // See comments at MatICCFactorSymbolic_SeqAIJ() on the layout of the factored matrix (U) on host. 
631 Mnz = Ai[m]; // Unz (with the unit diagonal) 632 PetscCall(PetscMalloc1(Mnz, &Ma)); 633 PetscCall(PetscMalloc1(Mnz, &Mj)); // Mj[] is temp 634 PetscCall(PetscMalloc1(m, &D)); // the diagonal 635 for (PetscInt i = 0; i < m; i++) { 636 PetscInt ulen = Ai[i + 1] - Ai[i]; 637 Mj[Ai[i]] = i; // diagonal entry 638 PetscCall(PetscArraycpy(Mj + Ai[i] + 1, Aj + Ai[i], ulen - 1)); // entries of U on the right of the diagonal 639 } 640 // Copy M (U) from host to device 641 PetscCallCUDA(cudaMalloc(&fs->csrRowPtr, sizeof(*fs->csrRowPtr) * (m + 1))); 642 PetscCallCUDA(cudaMalloc(&fs->csrColIdx, sizeof(*fs->csrColIdx) * Mnz)); 643 PetscCallCUDA(cudaMalloc(&fs->csrVal, sizeof(*fs->csrVal) * Mnz)); 644 PetscCallCUDA(cudaMalloc(&fs->diag, sizeof(*fs->diag) * m)); 645 PetscCallCUDA(cudaMemcpy(fs->csrRowPtr, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyHostToDevice)); 646 PetscCallCUDA(cudaMemcpy(fs->csrColIdx, Mj, sizeof(*Mj) * Mnz, cudaMemcpyHostToDevice)); 647 648 // Create descriptors for L, U. See https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 649 // cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 650 // assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 651 // all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 652 // assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 653 cusparseFillMode_t fillMode = CUSPARSE_FILL_MODE_UPPER; 654 cusparseDiagType_t diagType = CUSPARSE_DIAG_TYPE_UNIT; // U is unit diagonal 655 const cusparseIndexType_t indexType = PetscDefined(USE_64BIT_INDICES) ? 
CUSPARSE_INDEX_64I : CUSPARSE_INDEX_32I; 656 657 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, Mnz, fs->csrRowPtr, fs->csrColIdx, fs->csrVal, indexType, indexType, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 658 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 659 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 660 661 // Allocate work vectors in SpSv 662 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(*fs->X) * m)); 663 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(*fs->Y) * m)); 664 665 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 666 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype)); 667 668 // Query buffer sizes for SpSV and then allocate buffers 669 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 670 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 671 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 672 673 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut)); // Ut solve uses the same matrix (spMatDescr_U), but different descr and buffer 674 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut)); 675 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut)); 676 677 // Record for reuse 678 fs->csrVal_h = Ma; 679 fs->diag_h = D; 680 PetscCall(PetscFree(Mj)); 681 } 682 // Copy the value 683 Ma = fs->csrVal_h; 684 D = fs->diag_h; 685 Mnz = Ai[m]; 686 for 
(PetscInt i = 0; i < m; i++) { 687 D[i] = Aa[Adiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal 688 Ma[Ai[i]] = (MatScalar)1.0; // set the unit diagonal, which is cosmetic since cusparse does not really read it given CUSPARSE_DIAG_TYPE_UNIT 689 for (PetscInt k = 0; k < Ai[i + 1] - Ai[i] - 1; k++) Ma[Ai[i] + 1 + k] = -Aa[Ai[i] + k]; 690 } 691 PetscCallCUDA(cudaMemcpy(fs->csrVal, Ma, sizeof(*Ma) * Mnz, cudaMemcpyHostToDevice)); 692 PetscCallCUDA(cudaMemcpy(fs->diag, D, sizeof(*D) * m, cudaMemcpyHostToDevice)); 693 694 #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1) 695 if (fs->updatedSpSVAnalysis) { 696 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 697 if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Ut, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL)); 698 } else 699 #endif 700 { 701 // Do cusparseSpSV_analysis(), which is numeric and requires valid and up-to-date matrix values 702 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U)); 703 PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Ut, fs->spsvBuffer_Ut)); 704 fs->updatedSpSVAnalysis = PETSC_TRUE; 705 } 706 } 707 PetscFunctionReturn(PETSC_SUCCESS); 708 } 709 710 // Solve Ut D U x = b 711 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_Cholesky(Mat A, Vec b, Vec x) 712 { 713 Mat_SeqAIJCUSPARSETriFactors *fs = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr); 714 Mat_SeqAIJ *aij = static_cast<Mat_SeqAIJ *>(A->data); 715 const PetscScalar *barray; 716 PetscScalar *xarray; 717 thrust::device_ptr<const PetscScalar> bGPU; 718 
thrust::device_ptr<PetscScalar> xGPU; 719 const cusparseSpSVAlg_t alg = CUSPARSE_SPSV_ALG_DEFAULT; 720 PetscInt m = A->rmap->n; 721 722 PetscFunctionBegin; 723 PetscCall(PetscLogGpuTimeBegin()); 724 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 725 PetscCall(VecCUDAGetArrayRead(b, &barray)); 726 xGPU = thrust::device_pointer_cast(xarray); 727 bGPU = thrust::device_pointer_cast(barray); 728 729 // Reorder b with the row permutation if needed, and wrap the result in fs->X 730 if (fs->rpermIndices) { 731 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X))); 732 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 733 } else { 734 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 735 } 736 737 // Solve Ut Y = X 738 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 739 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut)); 740 741 // Solve diag(D) Z = Y. Actually just do Y = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). 742 // It is basically a vector element-wise multiplication, but cublas does not have it! 
743 PetscCallThrust(thrust::transform(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::device_pointer_cast(fs->Y), thrust::device_pointer_cast(fs->Y + m), thrust::device_pointer_cast(fs->diag), thrust::device_pointer_cast(fs->Y), thrust::multiplies<PetscScalar>())); 744 745 // Solve U X = Y 746 if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X 747 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X)); 748 } else { 749 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray)); 750 } 751 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U)); 752 753 // Reorder X with the column permutation if needed, and put the result back to x 754 if (fs->cpermIndices) { 755 PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()), 756 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU)); 757 } 758 759 PetscCall(VecCUDARestoreArrayRead(b, &barray)); 760 PetscCall(VecCUDARestoreArrayWrite(x, &xarray)); 761 PetscCall(PetscLogGpuTimeEnd()); 762 PetscCall(PetscLogGpuFlops(4.0 * aij->nz - A->rmap->n)); 763 PetscFunctionReturn(PETSC_SUCCESS); 764 } 765 #else 766 static PetscErrorCode MatSeqAIJCUSPARSEBuildICCTriMatrices(Mat A) 767 { 768 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 769 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 770 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 771 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 772 PetscInt *AiUp, *AjUp; 773 PetscScalar *AAUp; 774 PetscScalar *AALo; 775 PetscInt nzUpper = 
a->nz, n = A->rmap->n, i, offset, nz, j; 776 Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)A->data; 777 const PetscInt *ai = b->i, *aj = b->j, *vj; 778 const MatScalar *aa = b->a, *v; 779 780 PetscFunctionBegin; 781 if (!n) PetscFunctionReturn(PETSC_SUCCESS); 782 if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) { 783 try { 784 PetscCallCUDA(cudaMallocHost((void **)&AAUp, nzUpper * sizeof(PetscScalar))); 785 PetscCallCUDA(cudaMallocHost((void **)&AALo, nzUpper * sizeof(PetscScalar))); 786 if (!upTriFactor && !loTriFactor) { 787 /* Allocate Space for the upper triangular matrix */ 788 PetscCallCUDA(cudaMallocHost((void **)&AiUp, (n + 1) * sizeof(PetscInt))); 789 PetscCallCUDA(cudaMallocHost((void **)&AjUp, nzUpper * sizeof(PetscInt))); 790 791 /* Fill the upper triangular matrix */ 792 AiUp[0] = (PetscInt)0; 793 AiUp[n] = nzUpper; 794 offset = 0; 795 for (i = 0; i < n; i++) { 796 /* set the pointers */ 797 v = aa + ai[i]; 798 vj = aj + ai[i]; 799 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 800 801 /* first, set the diagonal elements */ 802 AjUp[offset] = (PetscInt)i; 803 AAUp[offset] = (MatScalar)1.0 / v[nz]; 804 AiUp[i] = offset; 805 AALo[offset] = (MatScalar)1.0 / v[nz]; 806 807 offset += 1; 808 if (nz > 0) { 809 PetscCall(PetscArraycpy(&AjUp[offset], vj, nz)); 810 PetscCall(PetscArraycpy(&AAUp[offset], v, nz)); 811 for (j = offset; j < offset + nz; j++) { 812 AAUp[j] = -AAUp[j]; 813 AALo[j] = AAUp[j] / v[nz]; 814 } 815 offset += nz; 816 } 817 } 818 819 /* allocate space for the triangular factor information */ 820 PetscCall(PetscNew(&upTriFactor)); 821 upTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 822 823 /* Create the matrix description */ 824 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactor->descr)); 825 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 826 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 827 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, 
CUSPARSE_MATRIX_TYPE_GENERAL)); 828 #else 829 PetscCallCUSPARSE(cusparseSetMatType(upTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 830 #endif 831 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 832 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactor->descr, CUSPARSE_DIAG_TYPE_UNIT)); 833 834 /* set the matrix */ 835 upTriFactor->csrMat = new CsrMatrix; 836 upTriFactor->csrMat->num_rows = A->rmap->n; 837 upTriFactor->csrMat->num_cols = A->cmap->n; 838 upTriFactor->csrMat->num_entries = a->nz; 839 840 upTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 841 upTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 842 843 upTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 844 upTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 845 846 upTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 847 upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 848 849 /* set the operation */ 850 upTriFactor->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 851 852 /* Create the solve analysis information */ 853 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 854 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactor->solveInfo)); 855 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 856 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 857 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, &upTriFactor->solveBufferSize)); 858 PetscCallCUDA(cudaMalloc(&upTriFactor->solveBuffer, upTriFactor->solveBufferSize)); 859 #endif 860 861 /* perform the solve analysis */ 862 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, 
upTriFactor->descr, upTriFactor->csrMat->values->data().get(), 863 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, upTriFactor->solvePolicy, upTriFactor->solveBuffer)); 864 865 PetscCallCUDA(WaitForCUDA()); 866 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 867 868 /* assign the pointer */ 869 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtr = upTriFactor; 870 871 /* allocate space for the triangular factor information */ 872 PetscCall(PetscNew(&loTriFactor)); 873 loTriFactor->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 874 875 /* Create the matrix description */ 876 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactor->descr)); 877 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactor->descr, CUSPARSE_INDEX_BASE_ZERO)); 878 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 879 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 880 #else 881 PetscCallCUSPARSE(cusparseSetMatType(loTriFactor->descr, CUSPARSE_MATRIX_TYPE_TRIANGULAR)); 882 #endif 883 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactor->descr, CUSPARSE_FILL_MODE_UPPER)); 884 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactor->descr, CUSPARSE_DIAG_TYPE_NON_UNIT)); 885 886 /* set the operation */ 887 loTriFactor->solveOp = CUSPARSE_OPERATION_TRANSPOSE; 888 889 /* set the matrix */ 890 loTriFactor->csrMat = new CsrMatrix; 891 loTriFactor->csrMat->num_rows = A->rmap->n; 892 loTriFactor->csrMat->num_cols = A->cmap->n; 893 loTriFactor->csrMat->num_entries = a->nz; 894 895 loTriFactor->csrMat->row_offsets = new THRUSTINTARRAY32(A->rmap->n + 1); 896 loTriFactor->csrMat->row_offsets->assign(AiUp, AiUp + A->rmap->n + 1); 897 898 loTriFactor->csrMat->column_indices = new THRUSTINTARRAY32(a->nz); 899 loTriFactor->csrMat->column_indices->assign(AjUp, AjUp + a->nz); 900 901 loTriFactor->csrMat->values = new THRUSTARRAY(a->nz); 902 loTriFactor->csrMat->values->assign(AALo, AALo + 
a->nz); 903 904 /* Create the solve analysis information */ 905 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 906 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactor->solveInfo)); 907 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 908 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 909 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, &loTriFactor->solveBufferSize)); 910 PetscCallCUDA(cudaMalloc(&loTriFactor->solveBuffer, loTriFactor->solveBufferSize)); 911 #endif 912 913 /* perform the solve analysis */ 914 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, loTriFactor->descr, loTriFactor->csrMat->values->data().get(), 915 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, loTriFactor->solvePolicy, loTriFactor->solveBuffer)); 916 917 PetscCallCUDA(WaitForCUDA()); 918 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 919 920 /* assign the pointer */ 921 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtr = loTriFactor; 922 923 PetscCall(PetscLogCpuToGpu(2 * (((A->rmap->n + 1) + (a->nz)) * sizeof(int) + (a->nz) * sizeof(PetscScalar)))); 924 PetscCallCUDA(cudaFreeHost(AiUp)); 925 PetscCallCUDA(cudaFreeHost(AjUp)); 926 } else { 927 /* Fill the upper triangular matrix */ 928 offset = 0; 929 for (i = 0; i < n; i++) { 930 /* set the pointers */ 931 v = aa + ai[i]; 932 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 933 934 /* first, set the diagonal elements */ 935 AAUp[offset] = 1.0 / v[nz]; 936 AALo[offset] = 1.0 / v[nz]; 937 938 offset += 1; 939 if (nz > 0) { 940 PetscCall(PetscArraycpy(&AAUp[offset], v, 
nz)); 941 for (j = offset; j < offset + nz; j++) { 942 AAUp[j] = -AAUp[j]; 943 AALo[j] = AAUp[j] / v[nz]; 944 } 945 offset += nz; 946 } 947 } 948 PetscCheck(upTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 949 PetscCheck(loTriFactor, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 950 upTriFactor->csrMat->values->assign(AAUp, AAUp + a->nz); 951 loTriFactor->csrMat->values->assign(AALo, AALo + a->nz); 952 PetscCall(PetscLogCpuToGpu(2 * (a->nz) * sizeof(PetscScalar))); 953 } 954 PetscCallCUDA(cudaFreeHost(AAUp)); 955 PetscCallCUDA(cudaFreeHost(AALo)); 956 } catch (char *ex) { 957 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 958 } 959 } 960 PetscFunctionReturn(PETSC_SUCCESS); 961 } 962 #endif 963 964 static PetscErrorCode MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(Mat A) 965 { 966 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 967 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 968 IS ip = a->row; 969 PetscBool perm_identity; 970 PetscInt n = A->rmap->n; 971 972 PetscFunctionBegin; 973 PetscCheck(cusparseTriFactors, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing cusparseTriFactors"); 974 975 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 976 PetscCall(MatSeqAIJCUSPARSEBuildFactoredMatrix_Cheolesky(A)); 977 #else 978 PetscCall(MatSeqAIJCUSPARSEBuildICCTriMatrices(A)); 979 if (!cusparseTriFactors->workVector) cusparseTriFactors->workVector = new THRUSTARRAY(n); 980 #endif 981 cusparseTriFactors->nnz = (a->nz - n) * 2 + n; 982 983 A->offloadmask = PETSC_OFFLOAD_BOTH; 984 985 /* lower triangular indices */ 986 PetscCall(ISIdentity(ip, &perm_identity)); 987 if (!perm_identity) { 988 IS iip; 989 const PetscInt *irip, *rip; 990 991 PetscCall(ISInvertPermutation(ip, PETSC_DECIDE, &iip)); 992 PetscCall(ISGetIndices(iip, &irip)); 993 PetscCall(ISGetIndices(ip, &rip)); 994 cusparseTriFactors->rpermIndices = new THRUSTINTARRAY(n); 995 cusparseTriFactors->rpermIndices->assign(rip, rip + n); 996 
cusparseTriFactors->cpermIndices = new THRUSTINTARRAY(n); 997 cusparseTriFactors->cpermIndices->assign(irip, irip + n); 998 PetscCall(ISRestoreIndices(iip, &irip)); 999 PetscCall(ISDestroy(&iip)); 1000 PetscCall(ISRestoreIndices(ip, &rip)); 1001 PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt))); 1002 } 1003 PetscFunctionReturn(PETSC_SUCCESS); 1004 } 1005 1006 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info) 1007 { 1008 PetscFunctionBegin; 1009 PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A)); 1010 PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 1011 B->offloadmask = PETSC_OFFLOAD_CPU; 1012 1013 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 1014 B->ops->solve = MatSolve_SeqAIJCUSPARSE_Cholesky; 1015 B->ops->solvetranspose = MatSolve_SeqAIJCUSPARSE_Cholesky; 1016 #else 1017 /* determine which version of MatSolve needs to be used. */ 1018 Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 1019 IS ip = b->row; 1020 PetscBool perm_identity; 1021 1022 PetscCall(ISIdentity(ip, &perm_identity)); 1023 if (perm_identity) { 1024 B->ops->solve = MatSolve_SeqAIJCUSPARSE_NaturalOrdering; 1025 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering; 1026 } else { 1027 B->ops->solve = MatSolve_SeqAIJCUSPARSE; 1028 B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE; 1029 } 1030 #endif 1031 B->ops->matsolve = NULL; 1032 B->ops->matsolvetranspose = NULL; 1033 1034 /* get the triangular factors */ 1035 PetscCall(MatSeqAIJCUSPARSEICCAnalysisAndCopyToGPU(B)); 1036 PetscFunctionReturn(PETSC_SUCCESS); 1037 } 1038 1039 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 1040 static PetscErrorCode MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(Mat A) 1041 { 1042 Mat_SeqAIJCUSPARSETriFactors *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr; 1043 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr; 1044 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor = 
(Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr; 1045 Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT; 1046 Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT; 1047 cusparseIndexBase_t indexBase; 1048 cusparseMatrixType_t matrixType; 1049 cusparseFillMode_t fillMode; 1050 cusparseDiagType_t diagType; 1051 1052 PetscFunctionBegin; 1053 /* allocate space for the transpose of the lower triangular factor */ 1054 PetscCall(PetscNew(&loTriFactorT)); 1055 loTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1056 1057 /* set the matrix descriptors of the lower triangular factor */ 1058 matrixType = cusparseGetMatType(loTriFactor->descr); 1059 indexBase = cusparseGetMatIndexBase(loTriFactor->descr); 1060 fillMode = cusparseGetMatFillMode(loTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1061 diagType = cusparseGetMatDiagType(loTriFactor->descr); 1062 1063 /* Create the matrix description */ 1064 PetscCallCUSPARSE(cusparseCreateMatDescr(&loTriFactorT->descr)); 1065 PetscCallCUSPARSE(cusparseSetMatIndexBase(loTriFactorT->descr, indexBase)); 1066 PetscCallCUSPARSE(cusparseSetMatType(loTriFactorT->descr, matrixType)); 1067 PetscCallCUSPARSE(cusparseSetMatFillMode(loTriFactorT->descr, fillMode)); 1068 PetscCallCUSPARSE(cusparseSetMatDiagType(loTriFactorT->descr, diagType)); 1069 1070 /* set the operation */ 1071 loTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1072 1073 /* allocate GPU space for the CSC of the lower triangular factor*/ 1074 loTriFactorT->csrMat = new CsrMatrix; 1075 loTriFactorT->csrMat->num_rows = loTriFactor->csrMat->num_cols; 1076 loTriFactorT->csrMat->num_cols = loTriFactor->csrMat->num_rows; 1077 loTriFactorT->csrMat->num_entries = loTriFactor->csrMat->num_entries; 1078 loTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_rows + 1); 1079 loTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(loTriFactorT->csrMat->num_entries); 1080 
loTriFactorT->csrMat->values = new THRUSTARRAY(loTriFactorT->csrMat->num_entries); 1081 1082 /* compute the transpose of the lower triangular factor, i.e. the CSC */ 1083 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1084 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), 1085 loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), 1086 loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &loTriFactor->csr2cscBufferSize)); 1087 PetscCallCUDA(cudaMalloc(&loTriFactor->csr2cscBuffer, loTriFactor->csr2cscBufferSize)); 1088 #endif 1089 1090 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1091 { 1092 // there is no clean way to have PetscCallCUSPARSE wrapping this function... 
1093 auto stat = cusparse_csr2csc(cusparseTriFactors->handle, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_cols, loTriFactor->csrMat->num_entries, loTriFactor->csrMat->values->data().get(), loTriFactor->csrMat->row_offsets->data().get(), 1094 loTriFactor->csrMat->column_indices->data().get(), loTriFactorT->csrMat->values->data().get(), 1095 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1096 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, loTriFactor->csr2cscBuffer); 1097 #else 1098 loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1099 #endif 1100 PetscCallCUSPARSE(stat); 1101 } 1102 1103 PetscCallCUDA(WaitForCUDA()); 1104 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1105 1106 /* Create the solve analysis information */ 1107 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1108 PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&loTriFactorT->solveInfo)); 1109 #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0) 1110 PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1111 loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, &loTriFactorT->solveBufferSize)); 1112 PetscCallCUDA(cudaMalloc(&loTriFactorT->solveBuffer, loTriFactorT->solveBufferSize)); 1113 #endif 1114 1115 /* perform the solve analysis */ 1116 PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(), 1117 
loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer)); 1118 1119 PetscCallCUDA(WaitForCUDA()); 1120 PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1121 1122 /* assign the pointer */ 1123 ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->loTriFactorPtrTranspose = loTriFactorT; 1124 1125 /*********************************************/ 1126 /* Now the Transpose of the Upper Tri Factor */ 1127 /*********************************************/ 1128 1129 /* allocate space for the transpose of the upper triangular factor */ 1130 PetscCall(PetscNew(&upTriFactorT)); 1131 upTriFactorT->solvePolicy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1132 1133 /* set the matrix descriptors of the upper triangular factor */ 1134 matrixType = cusparseGetMatType(upTriFactor->descr); 1135 indexBase = cusparseGetMatIndexBase(upTriFactor->descr); 1136 fillMode = cusparseGetMatFillMode(upTriFactor->descr) == CUSPARSE_FILL_MODE_UPPER ? 
CUSPARSE_FILL_MODE_LOWER : CUSPARSE_FILL_MODE_UPPER; 1137 diagType = cusparseGetMatDiagType(upTriFactor->descr); 1138 1139 /* Create the matrix description */ 1140 PetscCallCUSPARSE(cusparseCreateMatDescr(&upTriFactorT->descr)); 1141 PetscCallCUSPARSE(cusparseSetMatIndexBase(upTriFactorT->descr, indexBase)); 1142 PetscCallCUSPARSE(cusparseSetMatType(upTriFactorT->descr, matrixType)); 1143 PetscCallCUSPARSE(cusparseSetMatFillMode(upTriFactorT->descr, fillMode)); 1144 PetscCallCUSPARSE(cusparseSetMatDiagType(upTriFactorT->descr, diagType)); 1145 1146 /* set the operation */ 1147 upTriFactorT->solveOp = CUSPARSE_OPERATION_NON_TRANSPOSE; 1148 1149 /* allocate GPU space for the CSC of the upper triangular factor*/ 1150 upTriFactorT->csrMat = new CsrMatrix; 1151 upTriFactorT->csrMat->num_rows = upTriFactor->csrMat->num_cols; 1152 upTriFactorT->csrMat->num_cols = upTriFactor->csrMat->num_rows; 1153 upTriFactorT->csrMat->num_entries = upTriFactor->csrMat->num_entries; 1154 upTriFactorT->csrMat->row_offsets = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_rows + 1); 1155 upTriFactorT->csrMat->column_indices = new THRUSTINTARRAY32(upTriFactorT->csrMat->num_entries); 1156 upTriFactorT->csrMat->values = new THRUSTARRAY(upTriFactorT->csrMat->num_entries); 1157 1158 /* compute the transpose of the upper triangular factor, i.e. 
the CSC */ 1159 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1160 PetscCallCUSPARSE(cusparseCsr2cscEx2_bufferSize(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), 1161 upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), 1162 upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, &upTriFactor->csr2cscBufferSize)); 1163 PetscCallCUDA(cudaMalloc(&upTriFactor->csr2cscBuffer, upTriFactor->csr2cscBufferSize)); 1164 #endif 1165 1166 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1167 { 1168 // there is no clean way to have PetscCallCUSPARSE wrapping this function... 1169 auto stat = cusparse_csr2csc(cusparseTriFactors->handle, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_cols, upTriFactor->csrMat->num_entries, upTriFactor->csrMat->values->data().get(), upTriFactor->csrMat->row_offsets->data().get(), 1170 upTriFactor->csrMat->column_indices->data().get(), upTriFactorT->csrMat->values->data().get(), 1171 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 1172 upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, CUSPARSE_CSR2CSC_ALG1, upTriFactor->csr2cscBuffer); 1173 #else 1174 upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->csrMat->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase); 1175 #endif 1176 PetscCallCUSPARSE(stat); 1177 } 1178 1179 PetscCallCUDA(WaitForCUDA()); 1180 PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0)); 1181 1182 /* Create the solve analysis information */ 1183 PetscCall(PetscLogEventBegin(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0)); 1184 
PetscCallCUSPARSE(cusparseCreateCsrsvInfo(&upTriFactorT->solveInfo));
  #if PETSC_PKG_CUDA_VERSION_GE(9, 0, 0)
  PetscCallCUSPARSE(cusparseXcsrsv_buffsize(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, &upTriFactorT->solveBufferSize));
  PetscCallCUDA(cudaMalloc(&upTriFactorT->solveBuffer, upTriFactorT->solveBufferSize));
  #endif

  /* perform the solve analysis */
  /* christ, would it have killed you to put this stuff in a function????????? */
  PetscCallCUSPARSE(cusparseXcsrsv_analysis(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                            upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSESolveAnalysis, A, 0, 0, 0));

  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSETriFactors *)A->spptr)->upTriFactorPtrTranspose = upTriFactorT;
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/* Unary functor (callable on host and device) that converts a PetscScalar to a PetscInt by
   casting its real part; used below to turn scalar-encoded entry indices back into integers. */
struct PetscScalarToPetscInt {
  __host__ __device__ PetscInt operator()(PetscScalar s) { return (PetscInt)PetscRealPart(s); }
};

/*
  MatSeqAIJCUSPARSEFormExplicitTranspose - Builds or refreshes the explicit transpose of A that is
  stored in ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose.

  Input Parameter:
. A - the matrix; it is first brought up to date on the GPU with MatSeqAIJCUSPARSECopyToGPU()

  Notes:
  Returns immediately when A->transupdated is already set.

  For the CSR format the transpose structure is created once. The first time through, csr2csc is run on
  the sequence 0,1,...,nnz-1 instead of the real values, which yields the permutation
  cusparsestruct->csr2csc_i; every call (including the first) then fills the transposed values with a
  single permuted copy of A's values.

  For any other format the existing transpose is invalidated and rebuilt; with CUDA >= 11.0 the ELL and
  HYB formats raise PETSC_ERR_SUP.

  On success A->transupdated is set to PETSC_TRUE.
*/
static PetscErrorCode MatSeqAIJCUSPARSEFormExplicitTranspose(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct, *matstructT;
  Mat_SeqAIJ                   *a = (Mat_SeqAIJ *)A->data;
  cusparseStatus_t              stat;
  cusparseIndexBase_t           indexBase;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat;
  PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing mat struct");
  matstructT = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose;
  PetscCheck(!A->transupdated || matstructT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing matTranspose struct");
  if (A->transupdated) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscLogEventBegin(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  PetscCall(PetscLogGpuTimeBegin());
  if (cusparsestruct->format != MAT_CUSPARSE_CSR) PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE));
  if (!cusparsestruct->matTranspose) { /* create cusparse matrix */
    matstructT = new Mat_SeqAIJCUSPARSEMultStruct;
    PetscCallCUSPARSE(cusparseCreateMatDescr(&matstructT->descr));
    indexBase = cusparseGetMatIndexBase(matstruct->descr);
    PetscCallCUSPARSE(cusparseSetMatIndexBase(matstructT->descr, indexBase));
    PetscCallCUSPARSE(cusparseSetMatType(matstructT->descr, CUSPARSE_MATRIX_TYPE_GENERAL));

    /* set alpha and beta */
    PetscCallCUDA(cudaMalloc((void **)&matstructT->alpha_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_zero, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&matstructT->beta_one, sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(matstructT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(matstructT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice));

    if (cusparsestruct->format == MAT_CUSPARSE_CSR) {
      /* The transpose has A's column count as its row count and vice versa */
      CsrMatrix *matrixT      = new CsrMatrix;
      matstructT->mat         = matrixT;
      matrixT->num_rows       = A->cmap->n;
      matrixT->num_cols       = A->rmap->n;
      matrixT->num_entries    = a->nz;
      matrixT->row_offsets    = new THRUSTINTARRAY32(matrixT->num_rows + 1);
      matrixT->column_indices = new THRUSTINTARRAY32(a->nz);
      matrixT->values         = new THRUSTARRAY(a->nz);

      /* Uncompressed row offsets of A on the device, taken from the host array a->i */
      if (!cusparsestruct->rowoffsets_gpu) cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);

  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
    #if PETSC_PKG_CUDA_VERSION_GE(11, 2, 1)
      stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx type due to THRUSTINTARRAY32 */
                               indexBase, cusparse_scalartype);
      PetscCallCUSPARSE(stat);
    #else
      /* cusparse-11.x returns errors with zero-sized matrices until 11.2.1,
         see https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cusparse-11.2.1

         I don't know what a proper value should be for matstructT->matDescr with empty matrices, so I just set
         it to NULL to blow it up if one relies on it. Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2,
         when nnz = 0, matrixT->row_offsets[] should be filled with indexBase. So I also set it accordingly.
      */
      if (matrixT->num_entries) {
        stat = cusparseCreateCsr(&matstructT->matDescr, matrixT->num_rows, matrixT->num_cols, matrixT->num_entries, matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), matrixT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, indexBase, cusparse_scalartype);
        PetscCallCUSPARSE(stat);

      } else {
        matstructT->matDescr = NULL;
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }
    #endif
  #endif
    } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) {
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0");
  #else
      CsrMatrix *temp  = new CsrMatrix;
      CsrMatrix *tempT = new CsrMatrix;
      /* First convert HYB to CSR */
      temp->num_rows       = A->rmap->n;
      temp->num_cols       = A->cmap->n;
      temp->num_entries    = a->nz;
      temp->row_offsets    = new THRUSTINTARRAY32(A->rmap->n + 1);
      temp->column_indices = new THRUSTINTARRAY32(a->nz);
      temp->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_hyb2csr(cusparsestruct->handle, matstruct->descr, (cusparseHybMat_t)matstruct->mat, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get());
      PetscCallCUSPARSE(stat);

      /* Next, convert CSR to CSC (i.e. the matrix transpose).
         tempT is the CSR form of A^T, so it has A->cmap->n rows: csr2csc writes the column-pointer array
         (one entry per column of A, plus one) into tempT->row_offsets, which must therefore hold
         A->cmap->n + 1 entries. Sizing it with A->rmap->n + 1 overflows when A has more columns than rows. */
      tempT->num_rows       = A->cmap->n;
      tempT->num_cols       = A->rmap->n;
      tempT->num_entries    = a->nz;
      tempT->row_offsets    = new THRUSTINTARRAY32(tempT->num_rows + 1);
      tempT->column_indices = new THRUSTINTARRAY32(a->nz);
      tempT->values         = new THRUSTARRAY(a->nz);

      stat = cusparse_csr2csc(cusparsestruct->handle, temp->num_rows, temp->num_cols, temp->num_entries, temp->values->data().get(), temp->row_offsets->data().get(), temp->column_indices->data().get(), tempT->values->data().get(),
                              tempT->column_indices->data().get(), tempT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
      PetscCallCUSPARSE(stat);

      /* Last, convert CSC to HYB; the dimensions passed are those of the transposed matrix held in tempT */
      cusparseHybMat_t hybMat;
      PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat));
      cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO;
      stat                             = cusparse_csr2hyb(cusparsestruct->handle, tempT->num_rows, tempT->num_cols, matstructT->descr, tempT->values->data().get(), tempT->row_offsets->data().get(), tempT->column_indices->data().get(), hybMat, 0, partition);
      PetscCallCUSPARSE(stat);

      /* assign the pointer */
      matstructT->mat = hybMat;
      A->transupdated = PETSC_TRUE;
      /* delete temporaries */
      if (tempT) {
        if (tempT->values) delete (THRUSTARRAY *)tempT->values;
        if (tempT->column_indices) delete (THRUSTINTARRAY32 *)tempT->column_indices;
        if (tempT->row_offsets) delete (THRUSTINTARRAY32 *)tempT->row_offsets;
        delete (CsrMatrix *)tempT;
      }
      if (temp) {
        if (temp->values) delete (THRUSTARRAY *)temp->values;
        if (temp->column_indices) delete (THRUSTINTARRAY32 *)temp->column_indices;
        if (temp->row_offsets) delete (THRUSTINTARRAY32 *)temp->row_offsets;
        delete (CsrMatrix *)temp;
      }
  #endif
    }
  }
  if (cusparsestruct->format == MAT_CUSPARSE_CSR) { /* transpose mat struct may be already present, update data */
    CsrMatrix *matrix  = (CsrMatrix *)matstruct->mat;
    CsrMatrix *matrixT = (CsrMatrix *)matstructT->mat;
    PetscCheck(matrix, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix");
    PetscCheck(matrix->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix rows");
    PetscCheck(matrix->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix cols");
    PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrix values");
    PetscCheck(matrixT, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT");
    PetscCheck(matrixT->row_offsets, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT rows");
    PetscCheck(matrixT->column_indices, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT cols");
    PetscCheck(matrixT->values, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CsrMatrixT values");
    if (!cusparsestruct->rowoffsets_gpu) { /* this may be absent when we did not construct the transpose with csr2csc */
      cusparsestruct->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
      cusparsestruct->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
      PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
    }
    if (!cusparsestruct->csr2csc_i) {
      /* Transpose the sequence 0,1,...,nnz-1 in place of the real values: afterwards matrixT->values[k]
         holds the position in A's value array of the k-th entry of the transpose. */
      THRUSTARRAY csr2csc_a(matrix->num_entries);
      PetscCallThrust(thrust::sequence(thrust::device, csr2csc_a.begin(), csr2csc_a.end(), 0.0));

      indexBase = cusparseGetMatIndexBase(matstruct->descr);
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      void  *csr2cscBuffer;
      size_t csr2cscBufferSize;
      stat = cusparseCsr2cscEx2_bufferSize(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, matrix->values->data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
                                           matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, &csr2cscBufferSize);
      PetscCallCUSPARSE(stat);
      PetscCallCUDA(cudaMalloc(&csr2cscBuffer, csr2cscBufferSize));
  #endif

      if (matrix->num_entries) {
        /* When there are no nonzeros, this routine mistakenly returns CUSPARSE_STATUS_INVALID_VALUE in
           mat_tests-ex62_15_mpiaijcusparse on ranks 0 and 2 with CUDA-11. But CUDA-10 is OK.
           I checked every parameters and they were just fine. I have no clue why cusparse complains.

           Per https://docs.nvidia.com/cuda/cusparse/index.html#csr2cscEx2, when nnz = 0, matrixT->row_offsets[]
           should be filled with indexBase. So I just take a shortcut here.
        */
        stat = cusparse_csr2csc(cusparsestruct->handle, A->rmap->n, A->cmap->n, matrix->num_entries, csr2csc_a.data().get(), cusparsestruct->rowoffsets_gpu->data().get(), matrix->column_indices->data().get(), matrixT->values->data().get(),
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
                                matrixT->row_offsets->data().get(), matrixT->column_indices->data().get(), cusparse_scalartype, CUSPARSE_ACTION_NUMERIC, indexBase, cusparsestruct->csr2cscAlg, csr2cscBuffer);
        PetscCallCUSPARSE(stat);
  #else
                                matrixT->column_indices->data().get(), matrixT->row_offsets->data().get(), CUSPARSE_ACTION_NUMERIC, indexBase);
        PetscCallCUSPARSE(stat);
  #endif
      } else {
        matrixT->row_offsets->assign(matrixT->row_offsets->size(), indexBase);
      }

      /* Store the scalar-encoded positions as integers; this permutation is reused by later calls */
      cusparsestruct->csr2csc_i = new THRUSTINTARRAY(matrix->num_entries);
      PetscCallThrust(thrust::transform(thrust::device, matrixT->values->begin(), matrixT->values->end(), cusparsestruct->csr2csc_i->begin(), PetscScalarToPetscInt()));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
      PetscCallCUDA(cudaFree(csr2cscBuffer));
  #endif
    }
    /* Gather A's current values into the transpose through the stored permutation */
    PetscCallThrust(
      thrust::copy(thrust::device, thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->begin()), thrust::make_permutation_iterator(matrix->values->begin(), cusparsestruct->csr2csc_i->end()), matrixT->values->begin()));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogEventEnd(MAT_CUSPARSEGenerateTranspose, A, 0, 0, 0));
  /* the compressed row indices is not used for matTranspose */
  matstructT->cprowIndices = NULL;
  /* assign the pointer */
  ((Mat_SeqAIJCUSPARSE *)A->spptr)->matTranspose = matstructT;
  A->transupdated                                = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatSolve_SeqAIJCUSPARSE_LU - Solves with the L and U factors held in A->spptr using cusparseSpSV_solve().

  Input Parameters:
+ A - the factored matrix
- b - the right-hand side

  Output Parameter:
. x - the solution

  Notes:
  If fs->rpermIndices is set, b is first gathered through it into the work array fs->X; otherwise b's
  device array is used directly. If fs->cpermIndices is set, the U solve writes into fs->X and the result
  is gathered through fs->cpermIndices into x; otherwise the U solve writes straight into x.
  fs->Y holds the intermediate result between the two triangular solves.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  const Mat_SeqAIJ                     *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const cusparseOperation_t             op  = CUSPARSE_OPERATION_NON_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve L Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  // Note that cusparseSpSV_solve() secretly uses the external buffer used in cusparseSpSV_analysis()!
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_L));

  // Solve U X = Y
  if (fs->cpermIndices) {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, op, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_U));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }
  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - m));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSolveTranspose_SeqAIJCUSPARSE_LU - Transpose solve with the L and U factors held in A->spptr, using
  cusparseSpSV_solve() with CUSPARSE_OPERATION_TRANSPOSE on the same L and U matrix descriptors.

  Input Parameters:
+ A - the factored matrix
- b - the right-hand side

  Output Parameter:
. x - the solution

  Notes:
  On the first call (fs->createdTransposeSpSVDescr unset) the transpose SpSV descriptors and their buffers
  are created. The transpose analysis is (re)run whenever fs->updatedTransposeSpSVAnalysis is unset.
  The U^T solve is done first, then the L^T solve; permutations are handled as in the non-transpose solve.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_LU(Mat A, Vec b, Vec x)
{
  Mat_SeqAIJCUSPARSETriFactors         *fs  = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(A->spptr);
  Mat_SeqAIJ                           *aij = static_cast<Mat_SeqAIJ *>(A->data);
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  const cusparseOperation_t             opA = CUSPARSE_OPERATION_TRANSPOSE;
  const cusparseSpSVAlg_t               alg = CUSPARSE_SPSV_ALG_DEFAULT;
  PetscInt                              m   = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  if (!fs->createdTransposeSpSVDescr) { // Call MatSolveTranspose() for the first time
    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* The matrix is still L. We only do transpose solve with it */
                                              fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

    PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Ut));
    PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, &fs->spsvBufferSize_Ut));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Ut, fs->spsvBufferSize_Ut));
    fs->createdTransposeSpSVDescr = PETSC_TRUE;
  }

  if (!fs->updatedTransposeSpSVAnalysis) {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut, fs->spsvBuffer_Ut));
    fs->updatedTransposeSpSVAnalysis = PETSC_TRUE;
  }

  PetscCall(VecCUDAGetArrayWrite(x, &xarray));
  PetscCall(VecCUDAGetArrayRead(b, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  // Reorder b with the row permutation if needed, and wrap the result in fs->X
  if (fs->rpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, fs->rpermIndices->end()), thrust::device_pointer_cast(fs->X)));
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray));
  }

  // Solve Ut Y = X
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, alg, fs->spsvDescr_Ut));

  // Solve Lt X = Y
  if (fs->cpermIndices) { // if need to permute, we need to use the intermediate buffer X
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, fs->X));
  } else {
    PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  }
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, opA, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, alg, fs->spsvDescr_Lt));

  // Reorder X with the column permutation if needed, and put the result back to x
  if (fs->cpermIndices) {
    PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X), fs->cpermIndices->begin()),
                                 thrust::make_permutation_iterator(thrust::device_pointer_cast(fs->X + m), fs->cpermIndices->end()), xGPU));
  }

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - A->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#else
/* Why do we need to analyze the transposed matrix again? Can't we just use op(A) = CUSPARSE_OPERATION_TRANSPOSE in MatSolve_SeqAIJCUSPARSE? */
/*
  MatSolveTranspose_SeqAIJCUSPARSE - Transpose solve for builds with CUDA older than 11.4, using the
  explicitly transposed triangular factors and cusparseXcsrsv_solve().

  Input Parameters:
+ A  - the factored matrix
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  The transposed factors are created on first use by MatSeqAIJCUSPARSEAnalyzeTransposeForSolve().
  Both cusparseTriFactors->rpermIndices and ->cpermIndices are dereferenced unconditionally, so this
  variant presumably is only installed when both permutations exist (see the _NaturalOrdering variant).
  cusparseTriFactors->workVector is used as scratch space.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  PetscInt                              n = xx->map->n;
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU + n, cusparseTriFactors->rpermIndices->end()), xGPU));

  /* First, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, xarray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* Last, copy the solution, xGPU, into a temporary with the column permutation ... can't be done in place. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(xGPU, cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(xGPU + n, cusparseTriFactors->cpermIndices->end()), tempGPU->begin()));

  /* Copy the temporary to the full solution. */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), tempGPU->begin(), tempGPU->end(), xGPU));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering - Transpose solve for builds with CUDA older than 11.4
  that applies no row or column permutation.

  Input Parameters:
+ A  - the factored matrix
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  Solves with the transposed upper factor from bb into cusparseTriFactors->workVector, then with the
  transposed lower factor from the work vector into xx. The transposed factors are created on first use.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactorT       = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Analyze the matrix and create the transpose ... on the fly */
  if (!loTriFactorT && !upTriFactorT) {
    PetscCall(MatSeqAIJCUSPARSEAnalyzeTransposeForSolve(A));
    loTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtrTranspose;
    upTriFactorT = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtrTranspose;
  }

  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactorT->solveOp, upTriFactorT->csrMat->num_rows, upTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactorT->descr, upTriFactorT->csrMat->values->data().get(),
                                         upTriFactorT->csrMat->row_offsets->data().get(), upTriFactorT->csrMat->column_indices->data().get(), upTriFactorT->solveInfo, barray, tempGPU->data().get(), upTriFactorT->solvePolicy, upTriFactorT->solveBuffer));

  /* Then, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactorT->solveOp, loTriFactorT->csrMat->num_rows, loTriFactorT->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactorT->descr, loTriFactorT->csrMat->values->data().get(),
                                         loTriFactorT->csrMat->row_offsets->data().get(), loTriFactorT->csrMat->column_indices->data().get(), loTriFactorT->solveInfo, tempGPU->data().get(), xarray, loTriFactorT->solvePolicy, loTriFactorT->solveBuffer));

  /* restore */
  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSolve_SeqAIJCUSPARSE - Solve for builds with CUDA older than 11.4, using the lower and upper
  triangular factors and cusparseXcsrsv_solve(), with row and column permutations applied.

  Input Parameters:
+ A  - the factored matrix
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  bb is gathered through rpermIndices into cusparseTriFactors->workVector, L is solved into xx, U is
  solved back into the work vector, and the result is gathered through cpermIndices into xx.
  Both permutation arrays are dereferenced unconditionally.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                    *barray;
  PetscScalar                          *xarray;
  thrust::device_ptr<const PetscScalar> bGPU;
  thrust::device_ptr<PetscScalar>       xGPU;
  Mat_SeqAIJCUSPARSETriFactors         *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct    *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                          *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));
  xGPU = thrust::device_pointer_cast(xarray);
  bGPU = thrust::device_pointer_cast(barray);

  PetscCall(PetscLogGpuTimeBegin());
  /* First, reorder with the row permutation */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->begin()), thrust::make_permutation_iterator(bGPU, cusparseTriFactors->rpermIndices->end()), tempGPU->begin()));

  /* Next, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, tempGPU->data().get(), xarray, loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Then, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, xarray, tempGPU->data().get(), upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  /* Last, reorder with the column permutation */
  PetscCallThrust(thrust::copy(thrust::cuda::par.on(PetscDefaultCudaStream), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->begin()), thrust::make_permutation_iterator(tempGPU->begin(), cusparseTriFactors->cpermIndices->end()), xGPU));

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSolve_SeqAIJCUSPARSE_NaturalOrdering - Solve for builds with CUDA older than 11.4 that applies no
  row or column permutation.

  Input Parameters:
+ A  - the factored matrix
- bb - the right-hand side

  Output Parameter:
. xx - the solution

  Notes:
  Solves with the lower factor from bb into cusparseTriFactors->workVector, then with the upper factor
  from the work vector into xx.
*/
static PetscErrorCode MatSolve_SeqAIJCUSPARSE_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  const PetscScalar                 *barray;
  PetscScalar                       *xarray;
  Mat_SeqAIJCUSPARSETriFactors      *cusparseTriFactors = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
  Mat_SeqAIJCUSPARSETriFactorStruct *loTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->loTriFactorPtr;
  Mat_SeqAIJCUSPARSETriFactorStruct *upTriFactor        = (Mat_SeqAIJCUSPARSETriFactorStruct *)cusparseTriFactors->upTriFactorPtr;
  THRUSTARRAY                       *tempGPU            = (THRUSTARRAY *)cusparseTriFactors->workVector;

  PetscFunctionBegin;
  /* Get the GPU pointers */
  PetscCall(VecCUDAGetArrayWrite(xx, &xarray));
  PetscCall(VecCUDAGetArrayRead(bb, &barray));

  PetscCall(PetscLogGpuTimeBegin());
  /* First, solve L */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, loTriFactor->solveOp, loTriFactor->csrMat->num_rows, loTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, loTriFactor->descr, loTriFactor->csrMat->values->data().get(),
                                         loTriFactor->csrMat->row_offsets->data().get(), loTriFactor->csrMat->column_indices->data().get(), loTriFactor->solveInfo, barray, tempGPU->data().get(), loTriFactor->solvePolicy, loTriFactor->solveBuffer));

  /* Next, solve U */
  PetscCallCUSPARSE(cusparseXcsrsv_solve(cusparseTriFactors->handle, upTriFactor->solveOp, upTriFactor->csrMat->num_rows, upTriFactor->csrMat->num_entries, &PETSC_CUSPARSE_ONE, upTriFactor->descr, upTriFactor->csrMat->values->data().get(),
                                         upTriFactor->csrMat->row_offsets->data().get(), upTriFactor->csrMat->column_indices->data().get(), upTriFactor->solveInfo, tempGPU->data().get(), xarray, upTriFactor->solvePolicy, upTriFactor->solveBuffer));

  PetscCall(VecCUDARestoreArrayRead(bb, &barray));
  PetscCall(VecCUDARestoreArrayWrite(xx, &xarray));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * cusparseTriFactors->nnz - A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
/*
  MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0 - Numeric ILU(0) factorization on the GPU with cusparseXcsrilu02().

  Input Parameters:
+ fact - the factor matrix whose GPU data (fact->spptr) receives the factorization
- A    - the matrix to factor; in debug builds it is checked to be of type MATSEQAIJCUSPARSE

  Notes:
  A's device values are copied into fs->csrVal and factored in place (the factorization call is skipped
  when the matrix has no rows). The SpSV analysis for L and U is then either run from scratch or, with
  CUDA >= 12.1.1 and an existing analysis, refreshed with cusparseSpSV_updateMatrix().

  Because the values of L and U change on every call, the transpose-solve analysis is marked stale on
  every call so that MatSolveTranspose_SeqAIJCUSPARSE_LU() redoes it before its next solve.

  On return the solve and solvetranspose function pointers of fact are set.
*/
static PetscErrorCode MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  if (m)
    PetscCallCUSPARSE(cusparseXcsrilu02(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */
                                        fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csrilu02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  if (fs->updatedSpSVAnalysis) {
    /* Analysis already exists; only refresh it with the new values (skipped when there is no value array) */
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_U, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
  #endif
  {
    /* cusparseSpSV_analysis() is numeric, i.e., it requires valid matrix values, therefore, we do it after cusparseXcsrilu02()
       See discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/78
    */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, fs->spsvBuffer_U));

    fs->updatedSpSVAnalysis = PETSC_TRUE;
  }
  /* L, U values have changed, reset the flag to indicate we need to redo cusparseSpSV_analysis() for transpose solve.
     This must happen on the cusparseSpSV_updateMatrix() path as well: that path only refreshes spsvDescr_L/U,
     not the separate transpose descriptors spsvDescr_Lt/Ut. */
  fs->updatedTransposeSpSVAnalysis = PETSC_FALSE;

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_LU; // spMatDescr_L/U uses 32-bit indices, but cusparseSpSV_solve() supports both 32 and 64. The info is encoded in cusparseSpMatDescr_t.
  fact->ops->solvetranspose    = MatSolveTranspose_SeqAIJCUSPARSE_LU;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(Mat fact, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
1775 */ 1776 PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/)); 1777 1778 fact->offloadmask = PETSC_OFFLOAD_BOTH; 1779 fact->factortype = MAT_FACTOR_ILU; 1780 fact->info.factor_mallocs = 0; 1781 fact->info.fill_ratio_given = info->fill; 1782 fact->info.fill_ratio_needed = 1.0; 1783 1784 aij->row = NULL; 1785 aij->col = NULL; 1786 1787 /* ====================================================================== */ 1788 /* Copy A's i, j to fact and also allocate the value array of fact. */ 1789 /* We'll do in-place factorization on fact */ 1790 /* ====================================================================== */ 1791 const int *Ai, *Aj; 1792 1793 m = fact->rmap->n; 1794 nz = aij->nz; 1795 1796 PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1))); 1797 PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz)); 1798 PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(*fs->csrVal) * nz)); 1799 PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai. 
The returned Ai, Aj are 32-bit */ 1800 PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1801 PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream)); 1802 1803 /* ====================================================================== */ 1804 /* Create descriptors for M, L, U */ 1805 /* ====================================================================== */ 1806 cusparseFillMode_t fillMode; 1807 cusparseDiagType_t diagType; 1808 1809 PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M)); 1810 PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO)); 1811 PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL)); 1812 1813 /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t 1814 cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always 1815 assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that 1816 all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine 1817 assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory. 
1818 */ 1819 fillMode = CUSPARSE_FILL_MODE_LOWER; 1820 diagType = CUSPARSE_DIAG_TYPE_UNIT; 1821 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1822 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1823 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1824 1825 fillMode = CUSPARSE_FILL_MODE_UPPER; 1826 diagType = CUSPARSE_DIAG_TYPE_NON_UNIT; 1827 PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_U, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 1828 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode))); 1829 PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_U, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType))); 1830 1831 /* ========================================================================= */ 1832 /* Query buffer sizes for csrilu0, SpSV and allocate buffers */ 1833 /* ========================================================================= */ 1834 PetscCallCUSPARSE(cusparseCreateCsrilu02Info(&fs->ilu0Info_M)); 1835 if (m) 1836 PetscCallCUSPARSE(cusparseXcsrilu02_bufferSize(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1837 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, &fs->factBufferSize_M)); 1838 1839 PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m)); 1840 PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m)); 1841 1842 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype)); 1843 PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, 
cusparse_scalartype)); 1844 1845 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L)); 1846 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L)); 1847 1848 PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_U)); 1849 PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_U, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_U, &fs->spsvBufferSize_U)); 1850 1851 /* From my experiment with the example at https://github.com/NVIDIA/CUDALibrarySamples/tree/master/cuSPARSE/bicgstab, 1852 and discussion at https://github.com/NVIDIA/CUDALibrarySamples/issues/77, 1853 spsvBuffer_L/U can not be shared (i.e., the same) for our case, but factBuffer_M can share with either of spsvBuffer_L/U. 1854 To save memory, we make factBuffer_M share with the bigger of spsvBuffer_L/U. 
1855 */ 1856 if (fs->spsvBufferSize_L > fs->spsvBufferSize_U) { 1857 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M))); 1858 fs->spsvBuffer_L = fs->factBuffer_M; 1859 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_U, fs->spsvBufferSize_U)); 1860 } else { 1861 PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_U, (size_t)fs->factBufferSize_M))); 1862 fs->spsvBuffer_U = fs->factBuffer_M; 1863 PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L)); 1864 } 1865 1866 /* ========================================================================== */ 1867 /* Perform analysis of ilu0 on M, SpSv on L and U */ 1868 /* The lower(upper) triangular part of M has the same sparsity pattern as L(U)*/ 1869 /* ========================================================================== */ 1870 int structural_zero; 1871 cusparseStatus_t status; 1872 1873 fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL; 1874 if (m) 1875 PetscCallCUSPARSE(cusparseXcsrilu02_analysis(fs->handle, m, nz, /* cusparseXcsrilu02 errors out with empty matrices (m=0) */ 1876 fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ilu0Info_M, fs->policy_M, fs->factBuffer_M)); 1877 if (PetscDefined(USE_DEBUG)) { 1878 /* cusparseXcsrilu02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. 
*/ 1879 status = cusparseXcsrilu02_zeroPivot(fs->handle, fs->ilu0Info_M, &structural_zero); 1880 PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csrilu02: A(%d,%d) is missing", structural_zero, structural_zero); 1881 } 1882 1883 /* Estimate FLOPs of the numeric factorization */ 1884 { 1885 Mat_SeqAIJ *Aseq = (Mat_SeqAIJ *)A->data; 1886 PetscInt *Ai, *Adiag, nzRow, nzLeft; 1887 PetscLogDouble flops = 0.0; 1888 1889 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1890 Ai = Aseq->i; 1891 Adiag = Aseq->diag; 1892 for (PetscInt i = 0; i < m; i++) { 1893 if (Ai[i] < Adiag[i] && Adiag[i] < Ai[i + 1]) { /* There are nonzeros left to the diagonal of row i */ 1894 nzRow = Ai[i + 1] - Ai[i]; 1895 nzLeft = Adiag[i] - Ai[i]; 1896 /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right 1897 and include the eliminated one will be updated, which incurs a multiplication and an addition. 1898 */ 1899 nzLeft = (nzRow - 1) / 2; 1900 flops += nzLeft * (2.0 * nzRow - nzLeft + 1); 1901 } 1902 } 1903 fs->numericFactFlops = flops; 1904 } 1905 fact->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0; 1906 PetscFunctionReturn(PETSC_SUCCESS); 1907 } 1908 1909 static PetscErrorCode MatSolve_SeqAIJCUSPARSE_ICC0(Mat fact, Vec b, Vec x) 1910 { 1911 Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr; 1912 Mat_SeqAIJ *aij = (Mat_SeqAIJ *)fact->data; 1913 const PetscScalar *barray; 1914 PetscScalar *xarray; 1915 1916 PetscFunctionBegin; 1917 PetscCall(VecCUDAGetArrayWrite(x, &xarray)); 1918 PetscCall(VecCUDAGetArrayRead(b, &barray)); 1919 PetscCall(PetscLogGpuTimeBegin()); 1920 1921 /* Solve L*y = b */ 1922 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, (void *)barray)); 1923 PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_Y, fs->Y)); 1924 PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, 
&PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* L Y = X */
                                       fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L));

  /* Solve Lt*x = y */
  PetscCallCUSPARSE(cusparseDnVecSetValues(fs->dnVecDescr_X, xarray));
  PetscCallCUSPARSE(cusparseSpSV_solve(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, /* Lt X = Y */
                                       fs->dnVecDescr_Y, fs->dnVecDescr_X, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt));

  PetscCall(VecCUDARestoreArrayRead(b, &barray));
  PetscCall(VecCUDARestoreArrayWrite(x, &xarray));

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0 * aij->nz - fact->rmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0 - Numeric ICC(0) factorization done on the device.

  Copies the values of A into the value array of fact (fs->csrVal), factorizes it in place with
  cusparseXcsric02(), refreshes the SpSV data of L and Lt, and installs MatSolve_SeqAIJCUSPARSE_ICC0()
  as both the solve and the transpose solve of fact.

  Input Parameters:
+ fact - the factor matrix set up by MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0()
. A    - the matrix to factor, expected to be of type MATSEQAIJCUSPARSE (checked in debug builds only)
- info - unused
*/
static PetscErrorCode MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, const MatFactorInfo *)
{
  Mat_SeqAIJCUSPARSETriFactors *fs    = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij   = (Mat_SeqAIJ *)fact->data;
  Mat_SeqAIJCUSPARSE           *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix                    *Acsr;
  PetscInt                      m, nz;
  PetscBool                     flg;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
  }

  /* Copy A's value to fact */
  m  = fact->rmap->n;
  nz = aij->nz;
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  Acsr = (CsrMatrix *)Acusp->mat->mat;
  PetscCallCUDA(cudaMemcpyAsync(fs->csrVal, Acsr->values->data().get(), sizeof(PetscScalar) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* Time the device work so that the flops logged at the end are matched by GPU time,
     as MatILUFactorNumeric_SeqAIJCUSPARSE_ILU0() does */
  PetscCall(PetscLogGpuTimeBegin());
  /* Factorize fact inplace */
  /* https://docs.nvidia.com/cuda/cusparse/index.html#csric02_solve
     csric02() only takes the lower triangular part of matrix A to perform factorization.
     The matrix type must be CUSPARSE_MATRIX_TYPE_GENERAL, the fill mode and diagonal type are ignored,
     and the strictly upper triangular part is ignored and never touched. It does not matter if A is Hermitian or not.
     In other words, from the point of view of csric02() A is Hermitian and only the lower triangular part is provided.
   */
  if (m) PetscCallCUSPARSE(cusparseXcsric02(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    int              numerical_zero;
    cusparseStatus_t status;
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &numerical_zero);
    PetscAssert(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Numerical zero pivot detected in csric02: A(%d,%d) is zero", numerical_zero, numerical_zero);
  }

  #if PETSC_PKG_CUDA_VERSION_GE(12, 1, 1)
  if (fs->updatedSpSVAnalysis) {
    /* The analysis was already done once: only push the new values into the existing SpSV descriptors */
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_L, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
    if (fs->csrVal) PetscCallCUSPARSE(cusparseSpSV_updateMatrix(fs->handle, fs->spsvDescr_Lt, fs->csrVal, CUSPARSE_SPSV_UPDATE_GENERAL));
  } else
  #endif
  {
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, fs->spsvBuffer_L));

    /* Note that cusparse reports this error if we use double and CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE
      ** On entry to cusparseSpSV_analysis(): conjugate transpose (opA) is not supported for matA data type, current -> CUDA_R_64F
    */
    PetscCallCUSPARSE(cusparseSpSV_analysis(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, fs->spsvBuffer_Lt));
    fs->updatedSpSVAnalysis = PETSC_TRUE;
  }

  fact->offloadmask            = PETSC_OFFLOAD_GPU;
  fact->ops->solve             = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->solvetranspose    = MatSolve_SeqAIJCUSPARSE_ICC0;
  fact->ops->matsolve          = NULL;
  fact->ops->matsolvetranspose = NULL;
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(fs->numericFactFlops));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0 - Symbolic ICC(0) factorization done on the device.

  Resets the triangular-factor data of fact, duplicates the meta data of A into fact, copies the 32-bit
  row offsets and column indices of A to the device, creates the cuSPARSE descriptors for M and L,
  sizes and allocates the work buffers, runs the csric02 analysis, estimates the flop count of the
  numeric phase and installs MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0() as the numeric routine.

  Input Parameters:
+ fact - the factor matrix to set up
. A    - the matrix to factor; in debug builds it is checked to be MATSEQAIJCUSPARSE, square and with a full diagonal
- info - factorization options; only info->fill is read here

  The IS argument is ignored.
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(Mat fact, Mat A, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *fs  = (Mat_SeqAIJCUSPARSETriFactors *)fact->spptr;
  Mat_SeqAIJ                   *aij = (Mat_SeqAIJ *)fact->data;
  PetscInt                      m, nz;

  PetscFunctionBegin;
  if (PetscDefined(USE_DEBUG)) {
    PetscInt  i;
    PetscBool flg, missing;

    PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
    PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Expected MATSEQAIJCUSPARSE, but input is %s", ((PetscObject)A)->type_name);
    PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
    PetscCall(MatMissingDiagonal(A, &missing, &i));
    PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
  }

  /* Free the old stale stuff */
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&fs));

  /* Copy over A's meta data to fact. Note that we also allocated fact's i,j,a on host,
     but they will not be used. Allocate them just for easy debugging.
   */
  PetscCall(MatDuplicateNoCreate_SeqAIJ(fact, A, MAT_DO_NOT_COPY_VALUES, PETSC_TRUE /*malloc*/));

  fact->offloadmask            = PETSC_OFFLOAD_BOTH;
  fact->factortype             = MAT_FACTOR_ICC;
  fact->info.factor_mallocs    = 0;
  fact->info.fill_ratio_given  = info->fill;
  fact->info.fill_ratio_needed = 1.0;

  aij->row = NULL;
  aij->col = NULL;

  /* ====================================================================== */
  /* Copy A's i, j to fact and also allocate the value array of fact.       */
  /* We'll do in-place factorization on fact                                */
  /* ====================================================================== */
  const int *Ai, *Aj;

  m  = fact->rmap->n;
  nz = aij->nz;

  PetscCallCUDA(cudaMalloc((void **)&fs->csrRowPtr32, sizeof(*fs->csrRowPtr32) * (m + 1)));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrColIdx32, sizeof(*fs->csrColIdx32) * nz));
  PetscCallCUDA(cudaMalloc((void **)&fs->csrVal, sizeof(PetscScalar) * nz));
  PetscCall(MatSeqAIJCUSPARSEGetIJ(A, PETSC_FALSE, &Ai, &Aj)); /* Do not use compressed Ai */
  PetscCallCUDA(cudaMemcpyAsync(fs->csrRowPtr32, Ai, sizeof(*Ai) * (m + 1), cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));
  PetscCallCUDA(cudaMemcpyAsync(fs->csrColIdx32, Aj, sizeof(*Aj) * nz, cudaMemcpyDeviceToDevice, PetscDefaultCudaStream));

  /* ====================================================================== */
  /* Create mat descriptors for M, L                                        */
  /* ====================================================================== */
  cusparseFillMode_t fillMode;
  cusparseDiagType_t diagType;

  PetscCallCUSPARSE(cusparseCreateMatDescr(&fs->matDescr_M));
  PetscCallCUSPARSE(cusparseSetMatIndexBase(fs->matDescr_M, CUSPARSE_INDEX_BASE_ZERO));
  PetscCallCUSPARSE(cusparseSetMatType(fs->matDescr_M, CUSPARSE_MATRIX_TYPE_GENERAL));

  /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparseDiagType_t
    cusparseDiagType_t: This type indicates if the matrix diagonal entries are unity. The diagonal elements are always
    assumed to be present, but if CUSPARSE_DIAG_TYPE_UNIT is passed to an API routine, then the routine assumes that
    all diagonal entries are unity and will not read or modify those entries. Note that in this case the routine
    assumes the diagonal entries are equal to one, regardless of what those entries are actually set to in memory.
  */
  fillMode = CUSPARSE_FILL_MODE_LOWER;
  diagType = CUSPARSE_DIAG_TYPE_NON_UNIT;
  PetscCallCUSPARSE(cusparseCreateCsr(&fs->spMatDescr_L, m, m, nz, fs->csrRowPtr32, fs->csrColIdx32, fs->csrVal, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_FILL_MODE, &fillMode, sizeof(fillMode)));
  PetscCallCUSPARSE(cusparseSpMatSetAttribute(fs->spMatDescr_L, CUSPARSE_SPMAT_DIAG_TYPE, &diagType, sizeof(diagType)));

  /* ========================================================================= */
  /* Query buffer sizes for csric0, SpSV of L and Lt, and allocate buffers     */
  /* ========================================================================= */
  PetscCallCUSPARSE(cusparseCreateCsric02Info(&fs->ic0Info_M));
  if (m) PetscCallCUSPARSE(cusparseXcsric02_bufferSize(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, &fs->factBufferSize_M));

  PetscCallCUDA(cudaMalloc((void **)&fs->X, sizeof(PetscScalar) * m));
  PetscCallCUDA(cudaMalloc((void **)&fs->Y, sizeof(PetscScalar) * m));

  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_X, m, fs->X, cusparse_scalartype));
  PetscCallCUSPARSE(cusparseCreateDnVec(&fs->dnVecDescr_Y, m, fs->Y, cusparse_scalartype));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_L));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_L, &fs->spsvBufferSize_L));

  PetscCallCUSPARSE(cusparseSpSV_createDescr(&fs->spsvDescr_Lt));
  PetscCallCUSPARSE(cusparseSpSV_bufferSize(fs->handle, CUSPARSE_OPERATION_TRANSPOSE, &PETSC_CUSPARSE_ONE, fs->spMatDescr_L, fs->dnVecDescr_X, fs->dnVecDescr_Y, cusparse_scalartype, CUSPARSE_SPSV_ALG_DEFAULT, fs->spsvDescr_Lt, &fs->spsvBufferSize_Lt));

  /* To save device memory, we make the factorization buffer share with one of the solver buffer.
     See also comments in MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0().
   */
  if (fs->spsvBufferSize_L > fs->spsvBufferSize_Lt) {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_L, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_L = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_Lt, fs->spsvBufferSize_Lt));
  } else {
    PetscCallCUDA(cudaMalloc((void **)&fs->factBuffer_M, PetscMax(fs->spsvBufferSize_Lt, (size_t)fs->factBufferSize_M)));
    fs->spsvBuffer_Lt = fs->factBuffer_M;
    PetscCallCUDA(cudaMalloc((void **)&fs->spsvBuffer_L, fs->spsvBufferSize_L));
  }

  /* ========================================================================== */
  /* Perform analysis of ic0 on M                                               */
  /* The lower triangular part of M has the same sparsity pattern as L          */
  /* ========================================================================== */
  int              structural_zero;
  cusparseStatus_t status;

  fs->policy_M = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
  if (m) PetscCallCUSPARSE(cusparseXcsric02_analysis(fs->handle, m, nz, fs->matDescr_M, fs->csrVal, fs->csrRowPtr32, fs->csrColIdx32, fs->ic0Info_M, fs->policy_M, fs->factBuffer_M));
  if (PetscDefined(USE_DEBUG)) {
    /* cusparseXcsric02_zeroPivot() is a blocking call. It calls cudaDeviceSynchronize() to make sure all previous kernels are done. */
    status = cusparseXcsric02_zeroPivot(fs->handle, fs->ic0Info_M, &structural_zero);
    PetscCheck(CUSPARSE_STATUS_ZERO_PIVOT != status, PETSC_COMM_SELF, PETSC_ERR_USER_INPUT, "Structural zero pivot detected in csric02: A(%d,%d) is missing", structural_zero, structural_zero);
  }

  /* Estimate FLOPs of the numeric factorization */
  {
    Mat_SeqAIJ    *Aseq = (Mat_SeqAIJ *)A->data;
    PetscInt      *Ai, nzRow, nzLeft;
    PetscLogDouble flops = 0.0;

    Ai = Aseq->i;
    for (PetscInt i = 0; i < m; i++) {
      nzRow = Ai[i + 1] - Ai[i];
      if (nzRow > 1) {
        /* We want to eliminate nonzeros left to the diagonal one by one. Assume each time, nonzeros right
          and include the eliminated one will be updated, which incurs a multiplication and an addition.
        */
        nzLeft = (nzRow - 1) / 2; /* the diagonal position is not looked up here; half of the off-diagonal entries is used */
        flops += nzLeft * (2.0 * nzRow - nzLeft + 1);
      }
    }
    fs->numericFactFlops = flops;
  }
  fact->ops->choleskyfactornumeric = MatICCFactorNumeric_SeqAIJCUSPARSE_ICC0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif

/*
  MatLUFactorNumeric_SeqAIJCUSPARSE - Numeric LU factorization done on the host with MatLUFactorNumeric_SeqAIJ();
  unless A requests CPU solves, the device triangular solves are then installed on B.
*/
static PetscErrorCode MatLUFactorNumeric_SeqAIJCUSPARSE(Mat B, Mat A, const MatFactorInfo *info)
{
  // use_cpu_solve is a field in Mat_SeqAIJCUSPARSE. B, a factored matrix, uses Mat_SeqAIJCUSPARSETriFactors.
  Mat_SeqAIJCUSPARSE *cusparsestruct = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
  B->offloadmask = PETSC_OFFLOAD_CPU;

  if (!cusparsestruct->use_cpu_solve) {
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    B->ops->solve          = MatSolve_SeqAIJCUSPARSE_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_LU;
#else
    /* determine which version of MatSolve needs to be used.
*/
    Mat_SeqAIJ *b     = (Mat_SeqAIJ *)B->data;
    IS          isrow = b->row, iscol = b->col;
    PetscBool   row_identity, col_identity;

    PetscCall(ISIdentity(isrow, &row_identity));
    PetscCall(ISIdentity(iscol, &col_identity));
    if (row_identity && col_identity) {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE_NaturalOrdering;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE_NaturalOrdering;
    } else {
      B->ops->solve          = MatSolve_SeqAIJCUSPARSE;
      B->ops->solvetranspose = MatSolveTranspose_SeqAIJCUSPARSE;
    }
#endif
  }
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;

  /* get the triangular factors */
  if (!cusparsestruct->use_cpu_solve) PetscCall(MatSeqAIJCUSPARSEILUAnalysisAndCopyToGPU(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatLUFactorSymbolic_SeqAIJCUSPARSE - Symbolic LU: drops any previous triangular-factor data of B,
  defers to MatLUFactorSymbolic_SeqAIJ() and selects MatLUFactorNumeric_SeqAIJCUSPARSE() for the numeric phase.
*/
static PetscErrorCode MatLUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *triFactors = static_cast<Mat_SeqAIJCUSPARSETriFactors *>(B->spptr);

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&triFactors));
  PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
  B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatILUFactorSymbolic_SeqAIJCUSPARSE - Symbolic ILU dispatcher.

  With CUDA 11.4 or later, zero levels of fill, identity row and column orderings and no request to
  factor on the host, the device ILU(0) path is taken. Every other case resets the triangular-factor
  data of B and uses the host symbolic factorization MatILUFactorSymbolic_SeqAIJ().
*/
static PetscErrorCode MatILUFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* The orderings are only inspected when factoring on the device is allowed; otherwise both flags stay false */
  PetscBool rowIsIdentity = PETSC_FALSE, colIsIdentity = PETSC_FALSE;
  if (!info->factoronhost) {
    PetscCall(ISIdentity(isrow, &rowIsIdentity));
    PetscCall(ISIdentity(iscol, &colIsIdentity));
  }
  if (!info->levels && rowIsIdentity && colIsIdentity) {
    PetscCall(MatILUFactorSymbolic_SeqAIJCUSPARSE_ILU0(B, A, isrow, iscol, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&triFactors));
    PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
    B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatICCFactorSymbolic_SeqAIJCUSPARSE - Symbolic ICC dispatcher.

  With CUDA 11.4 or later, zero levels of fill, an identity permutation and no request to factor on
  the host, the device ICC(0) path is taken. Every other case resets the triangular-factor data of B
  and uses the host symbolic factorization MatICCFactorSymbolic_SeqAIJ().
*/
static PetscErrorCode MatICCFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* The permutation is only inspected when factoring on the device is allowed; otherwise the flag stays false */
  PetscBool permIsIdentity = PETSC_FALSE;
  if (!info->factoronhost) PetscCall(ISIdentity(perm, &permIsIdentity));
  if (!info->levels && permIsIdentity) {
    PetscCall(MatICCFactorSymbolic_SeqAIJCUSPARSE_ICC0(B, A, perm, info));
  } else
#endif
  {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&triFactors));
    PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
    B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatCholeskyFactorSymbolic_SeqAIJCUSPARSE - Symbolic Cholesky: drops any previous triangular-factor data of B,
  defers to MatCholeskyFactorSymbolic_SeqAIJ() and selects MatCholeskyFactorNumeric_SeqAIJCUSPARSE() for the numeric phase.
*/
static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJCUSPARSE(Mat B, Mat A, IS perm, const MatFactorInfo *info)
{
  Mat_SeqAIJCUSPARSETriFactors *triFactors = (Mat_SeqAIJCUSPARSETriFactors *)B->spptr;

  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(&triFactors));
  PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info));
  B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Reports MATSOLVERCUSPARSE as the solver type; the Mat argument is unused */
static PetscErrorCode MatFactorGetSolverType_seqaij_cusparse(Mat, MatSolverType *type)
{
  PetscFunctionBegin;
  *type = MATSOLVERCUSPARSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATSOLVERCUSPARSE = "cusparse" - A matrix type providing triangular solvers for seq matrices
  on a single GPU of type, `MATSEQAIJCUSPARSE`. Currently supported
  algorithms are ILU(k) and ICC(k). Typically, deeper factorizations (larger k) result in poorer
  performance in the triangular solves. Full LU, and Cholesky decompositions can be solved through the
  CuSPARSE triangular solve algorithm. However, the performance can be quite poor and thus these
  algorithms are not recommended. This class does NOT support direct solver operations.

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATSEQAIJCUSPARSE`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJCUSPARSE()`,
          `MATAIJCUSPARSE`, `MatCreateAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/*
  MatGetFactor_seqaijcusparse_cusparse - Creates the factor matrix B of type MATSEQAIJCUSPARSE for A and
  installs the symbolic factorization routines matching ftype (host versions when A is bound to the CPU).
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijcusparse_cusparse(Mat A, MatFactorType ftype, Mat *B)
{
  PetscInt n = A->rmap->n;

  PetscFunctionBegin;
  PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
  PetscCall(MatSetSizes(*B, n, n, n, n));
  (*B)->factortype = ftype; // factortype makes MatSetType() allocate spptr of type Mat_SeqAIJCUSPARSETriFactors
  PetscCall(MatSetType(*B, MATSEQAIJCUSPARSE));

  if (A->boundtocpu && A->bindingpropagates) PetscCall(MatBindToCPU(*B, PETSC_TRUE));
  if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
    PetscCall(MatSetBlockSizesFromMats(*B, A, A));
    if (!A->boundtocpu) {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJ;
      (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
  } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
    if (!A->boundtocpu) {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJCUSPARSE;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJCUSPARSE;
    } else {
      (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJ;
      (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJ;
    }
    PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
    PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
  } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for CUSPARSE Matrix Types");

  PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
  (*B)->canuseordering = PETSC_TRUE;
  PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_cusparse));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSECopyFromGPU - Copies the matrix values from the device to the host array a->a when
  the up-to-date copy lives only on the device (offloadmask is PETSC_OFFLOAD_GPU); otherwise does nothing.

  For an unfactored matrix the values come from the device CSR matrix. With CUDA 11.4 or later, a
  factored matrix whose fs->csrVal is set is copied from that array. Any other factored matrix raises PETSC_ERR_SUP.
*/
static PetscErrorCode MatSeqAIJCUSPARSECopyFromGPU(Mat A)
{
  Mat_SeqAIJ         *a    = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* Same spptr seen as the factored-matrix type; which of the two views is used depends on A->factortype below */
  Mat_SeqAIJCUSPARSETriFactors *fs = (Mat_SeqAIJCUSPARSETriFactors *)A->spptr;
#endif

  PetscFunctionBegin;
  if (A->offloadmask == PETSC_OFFLOAD_GPU) {
    PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    if (A->factortype == MAT_FACTOR_NONE) {
      CsrMatrix *matrix = (CsrMatrix *)cusp->mat->mat;
      PetscCallCUDA(cudaMemcpy(a->a, matrix->values->data().get(), a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
    else if (fs->csrVal) {
      /* We have a factorized matrix on device and are able to copy it to host */
      PetscCallCUDA(cudaMemcpy(a->a, fs->csrVal, a->nz * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
    }
#endif
    else
      SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for copying this type of factorized matrix from device to host");
    PetscCall(PetscLogGpuToCpu(a->nz * sizeof(PetscScalar)));
    PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyFromGPU, A, 0, 0, 0));
    A->offloadmask = PETSC_OFFLOAD_BOTH;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Returns the host value array for read-write access after bringing it up to date from the device */
static PetscErrorCode MatSeqAIJGetArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Ends read-write access: marks the host copy as the only valid one and clears the caller's pointer */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Returns the host value array for read-only access after bringing it up to date from the device */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJCUSPARSE(Mat A, const PetscScalar *array[])
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJCUSPARSECopyFromGPU(A));
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Ends read-only access: only clears the caller's pointer, the offload state is left alone */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE(Mat, const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Returns the host value array for write-only access; nothing is copied from the device */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = ((Mat_SeqAIJ *)A->data)->a;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Ends write-only access: marks the host copy as the only valid one and clears the caller's pointer */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE(Mat A, PetscScalar *array[])
{
  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_CPU;
  *array         = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE - Gives access to the device CSR arrays of an unfactored matrix.

  Input Parameter:
. A - the matrix; must not be a factored matrix

  Output Parameters (each may be NULL if not wanted):
+ i     - device row offsets; requesting it raises PETSC_ERR_SUP when PETSC_USE_64BIT_INDICES is defined
. j     - device column indices; requesting it raises PETSC_ERR_SUP when PETSC_USE_64BIT_INDICES is defined
. a     - device values
- mtype - set to PETSC_MEMTYPE_CUDA
*/
static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
{
  Mat_SeqAIJCUSPARSE *cusp;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  /* Validate before touching the device: MatSeqAIJCUSPARSECopyToGPU() reads A->spptr as Mat_SeqAIJCUSPARSE,
     which is neither valid for a factored matrix nor for a NULL spptr */
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "Not for factored matrix");
  PetscCheck(A->spptr != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "cusp is NULL");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  cusp   = static_cast<Mat_SeqAIJCUSPARSE *>(A->spptr);
  matrix = (CsrMatrix *)cusp->mat->mat;

  if (i) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *i = matrix->row_offsets->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (j) {
#if !defined(PETSC_USE_64BIT_INDICES)
    *j = matrix->column_indices->data().get();
#else
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSparse does not supported 64-bit indices");
#endif
  }
  if (a) *a = matrix->values->data().get();
  if (mtype) *mtype = PETSC_MEMTYPE_CUDA;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat A)
{
  Mat_SeqAIJCUSPARSE           *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Mat_SeqAIJCUSPARSEMultStruct *matstruct      = cusparsestruct->mat;
  Mat_SeqAIJ                   *a              = (Mat_SeqAIJ *)A->data;
  PetscInt                      m              = A->rmap->n, *ii, *ridx, tmp;
  cusparseStatus_t              stat;
  PetscBool                     both = PETSC_TRUE;

  PetscFunctionBegin;
  PetscCheck(!A->boundtocpu, PETSC_COMM_SELF, PETSC_ERR_GPU, "Cannot copy to GPU");
  if (A->offloadmask == PETSC_OFFLOAD_UNALLOCATED || A->offloadmask == PETSC_OFFLOAD_CPU) {
    if (A->nonzerostate == cusparsestruct->nonzerostate && cusparsestruct->format == MAT_CUSPARSE_CSR) { /* Copy values only */
      CsrMatrix *matrix;
      matrix = (CsrMatrix *)cusparsestruct->mat->mat;
2433 2434 PetscCheck(!a->nz || a->a, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR values"); 2435 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2436 matrix->values->assign(a->a, a->a + a->nz); 2437 PetscCallCUDA(WaitForCUDA()); 2438 PetscCall(PetscLogCpuToGpu(a->nz * sizeof(PetscScalar))); 2439 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2440 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE)); 2441 } else { 2442 PetscInt nnz; 2443 PetscCall(PetscLogEventBegin(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2444 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusparsestruct->mat, cusparsestruct->format)); 2445 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_TRUE)); 2446 delete cusparsestruct->workVector; 2447 delete cusparsestruct->rowoffsets_gpu; 2448 cusparsestruct->workVector = NULL; 2449 cusparsestruct->rowoffsets_gpu = NULL; 2450 try { 2451 if (a->compressedrow.use) { 2452 m = a->compressedrow.nrows; 2453 ii = a->compressedrow.i; 2454 ridx = a->compressedrow.rindex; 2455 } else { 2456 m = A->rmap->n; 2457 ii = a->i; 2458 ridx = NULL; 2459 } 2460 PetscCheck(ii, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR row data"); 2461 if (!a->a) { 2462 nnz = ii[m]; 2463 both = PETSC_FALSE; 2464 } else nnz = a->nz; 2465 PetscCheck(!nnz || a->j, PETSC_COMM_SELF, PETSC_ERR_GPU, "Missing CSR column data"); 2466 2467 /* create cusparse matrix */ 2468 cusparsestruct->nrows = m; 2469 matstruct = new Mat_SeqAIJCUSPARSEMultStruct; 2470 PetscCallCUSPARSE(cusparseCreateMatDescr(&matstruct->descr)); 2471 PetscCallCUSPARSE(cusparseSetMatIndexBase(matstruct->descr, CUSPARSE_INDEX_BASE_ZERO)); 2472 PetscCallCUSPARSE(cusparseSetMatType(matstruct->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 2473 2474 PetscCallCUDA(cudaMalloc((void **)&matstruct->alpha_one, sizeof(PetscScalar))); 2475 PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_zero, sizeof(PetscScalar))); 2476 PetscCallCUDA(cudaMalloc((void **)&matstruct->beta_one, sizeof(PetscScalar))); 2477 
PetscCallCUDA(cudaMemcpy(matstruct->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2478 PetscCallCUDA(cudaMemcpy(matstruct->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2479 PetscCallCUDA(cudaMemcpy(matstruct->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 2480 PetscCallCUSPARSE(cusparseSetPointerMode(cusparsestruct->handle, CUSPARSE_POINTER_MODE_DEVICE)); 2481 2482 /* Build a hybrid/ellpack matrix if this option is chosen for the storage */ 2483 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 2484 /* set the matrix */ 2485 CsrMatrix *mat = new CsrMatrix; 2486 mat->num_rows = m; 2487 mat->num_cols = A->cmap->n; 2488 mat->num_entries = nnz; 2489 PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1)); 2490 mat->row_offsets->assign(ii, ii + m + 1); 2491 PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz)); 2492 mat->column_indices->assign(a->j, a->j + nnz); 2493 2494 PetscCallCXX(mat->values = new THRUSTARRAY(nnz)); 2495 if (a->a) mat->values->assign(a->a, a->a + nnz); 2496 2497 /* assign the pointer */ 2498 matstruct->mat = mat; 2499 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2500 if (mat->num_rows) { /* cusparse errors on empty matrices! 
*/ 2501 stat = cusparseCreateCsr(&matstruct->matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2502 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2503 PetscCallCUSPARSE(stat); 2504 } 2505 #endif 2506 } else if (cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB) { 2507 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2508 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 2509 #else 2510 CsrMatrix *mat = new CsrMatrix; 2511 mat->num_rows = m; 2512 mat->num_cols = A->cmap->n; 2513 mat->num_entries = nnz; 2514 PetscCallCXX(mat->row_offsets = new THRUSTINTARRAY32(m + 1)); 2515 mat->row_offsets->assign(ii, ii + m + 1); 2516 2517 PetscCallCXX(mat->column_indices = new THRUSTINTARRAY32(nnz)); 2518 mat->column_indices->assign(a->j, a->j + nnz); 2519 2520 PetscCallCXX(mat->values = new THRUSTARRAY(nnz)); 2521 if (a->a) mat->values->assign(a->a, a->a + nnz); 2522 2523 cusparseHybMat_t hybMat; 2524 PetscCallCUSPARSE(cusparseCreateHybMat(&hybMat)); 2525 cusparseHybPartition_t partition = cusparsestruct->format == MAT_CUSPARSE_ELL ? 
CUSPARSE_HYB_PARTITION_MAX : CUSPARSE_HYB_PARTITION_AUTO; 2526 stat = cusparse_csr2hyb(cusparsestruct->handle, mat->num_rows, mat->num_cols, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), hybMat, 0, partition); 2527 PetscCallCUSPARSE(stat); 2528 /* assign the pointer */ 2529 matstruct->mat = hybMat; 2530 2531 if (mat) { 2532 if (mat->values) delete (THRUSTARRAY *)mat->values; 2533 if (mat->column_indices) delete (THRUSTINTARRAY32 *)mat->column_indices; 2534 if (mat->row_offsets) delete (THRUSTINTARRAY32 *)mat->row_offsets; 2535 delete (CsrMatrix *)mat; 2536 } 2537 #endif 2538 } 2539 2540 /* assign the compressed row indices */ 2541 if (a->compressedrow.use) { 2542 PetscCallCXX(cusparsestruct->workVector = new THRUSTARRAY(m)); 2543 PetscCallCXX(matstruct->cprowIndices = new THRUSTINTARRAY(m)); 2544 matstruct->cprowIndices->assign(ridx, ridx + m); 2545 tmp = m; 2546 } else { 2547 cusparsestruct->workVector = NULL; 2548 matstruct->cprowIndices = NULL; 2549 tmp = 0; 2550 } 2551 PetscCall(PetscLogCpuToGpu(((m + 1) + (a->nz)) * sizeof(int) + tmp * sizeof(PetscInt) + (3 + (a->nz)) * sizeof(PetscScalar))); 2552 2553 /* assign the pointer */ 2554 cusparsestruct->mat = matstruct; 2555 } catch (char *ex) { 2556 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_LIB, "CUSPARSE error: %s", ex); 2557 } 2558 PetscCallCUDA(WaitForCUDA()); 2559 PetscCall(PetscLogEventEnd(MAT_CUSPARSECopyToGPU, A, 0, 0, 0)); 2560 cusparsestruct->nonzerostate = A->nonzerostate; 2561 } 2562 if (both) A->offloadmask = PETSC_OFFLOAD_BOTH; 2563 } 2564 PetscFunctionReturn(PETSC_SUCCESS); 2565 } 2566 2567 struct VecCUDAPlusEquals { 2568 template <typename Tuple> 2569 __host__ __device__ void operator()(Tuple t) 2570 { 2571 thrust::get<1>(t) = thrust::get<1>(t) + thrust::get<0>(t); 2572 } 2573 }; 2574 2575 struct VecCUDAEquals { 2576 template <typename Tuple> 2577 __host__ __device__ void operator()(Tuple t) 2578 { 2579 thrust::get<1>(t) = thrust::get<0>(t); 
2580 } 2581 }; 2582 2583 struct VecCUDAEqualsReverse { 2584 template <typename Tuple> 2585 __host__ __device__ void operator()(Tuple t) 2586 { 2587 thrust::get<0>(t) = thrust::get<1>(t); 2588 } 2589 }; 2590 2591 struct MatProductCtx_MatMatCusparse { 2592 PetscBool cisdense; 2593 PetscScalar *Bt; 2594 Mat X; 2595 PetscBool reusesym; /* Cusparse does not have split symbolic and numeric phases for sparse matmat operations */ 2596 PetscLogDouble flops; 2597 CsrMatrix *Bcsr; 2598 2599 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2600 cusparseSpMatDescr_t matSpBDescr; 2601 PetscBool initialized; /* C = alpha op(A) op(B) + beta C */ 2602 cusparseDnMatDescr_t matBDescr; 2603 cusparseDnMatDescr_t matCDescr; 2604 PetscInt Blda, Clda; /* Record leading dimensions of B and C here to detect changes*/ 2605 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2606 void *dBuffer4; 2607 void *dBuffer5; 2608 #endif 2609 size_t mmBufferSize; 2610 void *mmBuffer; 2611 void *mmBuffer2; /* SpGEMM WorkEstimation buffer */ 2612 cusparseSpGEMMDescr_t spgemmDesc; 2613 #endif 2614 }; 2615 2616 static PetscErrorCode MatProductCtxDestroy_MatMatCusparse(void **data) 2617 { 2618 MatProductCtx_MatMatCusparse *mmdata = *(MatProductCtx_MatMatCusparse **)data; 2619 2620 PetscFunctionBegin; 2621 PetscCallCUDA(cudaFree(mmdata->Bt)); 2622 delete mmdata->Bcsr; 2623 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2624 if (mmdata->matSpBDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mmdata->matSpBDescr)); 2625 if (mmdata->matBDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 2626 if (mmdata->matCDescr) PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 2627 if (mmdata->spgemmDesc) PetscCallCUSPARSE(cusparseSpGEMM_destroyDescr(mmdata->spgemmDesc)); 2628 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 2629 if (mmdata->dBuffer4) PetscCallCUDA(cudaFree(mmdata->dBuffer4)); 2630 if (mmdata->dBuffer5) PetscCallCUDA(cudaFree(mmdata->dBuffer5)); 2631 #endif 2632 if (mmdata->mmBuffer) 
PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2633 if (mmdata->mmBuffer2) PetscCallCUDA(cudaFree(mmdata->mmBuffer2)); 2634 #endif 2635 PetscCall(MatDestroy(&mmdata->X)); 2636 PetscCall(PetscFree(*data)); 2637 PetscFunctionReturn(PETSC_SUCCESS); 2638 } 2639 2640 #include <../src/mat/impls/dense/seq/dense.h> // MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal() 2641 2642 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2643 { 2644 Mat_Product *product = C->product; 2645 Mat A, B; 2646 PetscInt m, n, blda, clda; 2647 PetscBool flg, biscuda; 2648 Mat_SeqAIJCUSPARSE *cusp; 2649 cusparseStatus_t stat; 2650 cusparseOperation_t opA; 2651 const PetscScalar *barray; 2652 PetscScalar *carray; 2653 MatProductCtx_MatMatCusparse *mmdata; 2654 Mat_SeqAIJCUSPARSEMultStruct *mat; 2655 CsrMatrix *csrmat; 2656 2657 PetscFunctionBegin; 2658 MatCheckProduct(C, 1); 2659 PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty"); 2660 mmdata = (MatProductCtx_MatMatCusparse *)product->data; 2661 A = product->A; 2662 B = product->B; 2663 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2664 PetscCheck(flg, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2665 /* currently CopyToGpu does not copy if the matrix is bound to CPU 2666 Instead of silently accepting the wrong answer, I prefer to raise the error */ 2667 PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases"); 2668 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 2669 cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2670 switch (product->type) { 2671 case MATPRODUCT_AB: 2672 case MATPRODUCT_PtAP: 2673 mat = cusp->mat; 2674 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2675 m = A->rmap->n; 2676 n = B->cmap->n; 2677 break; 2678 case MATPRODUCT_AtB: 2679 if (!A->form_explicit_transpose) { 2680 mat = 
cusp->mat; 2681 opA = CUSPARSE_OPERATION_TRANSPOSE; 2682 } else { 2683 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 2684 mat = cusp->matTranspose; 2685 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2686 } 2687 m = A->cmap->n; 2688 n = B->cmap->n; 2689 break; 2690 case MATPRODUCT_ABt: 2691 case MATPRODUCT_RARt: 2692 mat = cusp->mat; 2693 opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 2694 m = A->rmap->n; 2695 n = B->rmap->n; 2696 break; 2697 default: 2698 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 2699 } 2700 PetscCheck(mat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 2701 csrmat = (CsrMatrix *)mat->mat; 2702 /* if the user passed a CPU matrix, copy the data to the GPU */ 2703 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSECUDA, &biscuda)); 2704 if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSECUDA, MAT_INPLACE_MATRIX, &B)); 2705 PetscCall(MatDenseGetArrayReadAndMemType(B, &barray, nullptr)); 2706 2707 PetscCall(MatDenseGetLDA(B, &blda)); 2708 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2709 PetscCall(MatDenseGetArrayWriteAndMemType(mmdata->X, &carray, nullptr)); 2710 PetscCall(MatDenseGetLDA(mmdata->X, &clda)); 2711 } else { 2712 PetscCall(MatDenseGetArrayWriteAndMemType(C, &carray, nullptr)); 2713 PetscCall(MatDenseGetLDA(C, &clda)); 2714 } 2715 2716 PetscCall(PetscLogGpuTimeBegin()); 2717 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 2718 cusparseOperation_t opB = (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) ? 
CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 2719 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 2720 cusparseSpMatDescr_t &matADescr = mat->matDescr_SpMM[opA]; 2721 #else 2722 cusparseSpMatDescr_t &matADescr = mat->matDescr; 2723 #endif 2724 2725 /* (re)allocate mmBuffer if not initialized or LDAs are different */ 2726 if (!mmdata->initialized || mmdata->Blda != blda || mmdata->Clda != clda) { 2727 size_t mmBufferSize; 2728 if (mmdata->initialized && mmdata->Blda != blda) { 2729 PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matBDescr)); 2730 mmdata->matBDescr = NULL; 2731 } 2732 if (!mmdata->matBDescr) { 2733 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matBDescr, B->rmap->n, B->cmap->n, blda, (void *)barray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2734 mmdata->Blda = blda; 2735 } 2736 2737 if (mmdata->initialized && mmdata->Clda != clda) { 2738 PetscCallCUSPARSE(cusparseDestroyDnMat(mmdata->matCDescr)); 2739 mmdata->matCDescr = NULL; 2740 } 2741 if (!mmdata->matCDescr) { /* matCDescr is for C or mmdata->X */ 2742 PetscCallCUSPARSE(cusparseCreateDnMat(&mmdata->matCDescr, m, n, clda, (void *)carray, cusparse_scalartype, CUSPARSE_ORDER_COL)); 2743 mmdata->Clda = clda; 2744 } 2745 2746 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // tested up to 12.6.0 2747 if (matADescr) { 2748 PetscCallCUSPARSE(cusparseDestroySpMat(matADescr)); // Because I find I could not reuse matADescr. 
It could be a cusparse bug 2749 matADescr = NULL; 2750 } 2751 #endif 2752 2753 if (!matADescr) { 2754 stat = cusparseCreateCsr(&matADescr, csrmat->num_rows, csrmat->num_cols, csrmat->num_entries, csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), csrmat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, /* row offset, col idx types due to THRUSTINTARRAY32 */ 2755 CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 2756 PetscCallCUSPARSE(stat); 2757 } 2758 2759 PetscCallCUSPARSE(cusparseSpMM_bufferSize(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, &mmBufferSize)); 2760 2761 if ((mmdata->mmBuffer && mmdata->mmBufferSize < mmBufferSize) || !mmdata->mmBuffer) { 2762 PetscCallCUDA(cudaFree(mmdata->mmBuffer)); 2763 PetscCallCUDA(cudaMalloc(&mmdata->mmBuffer, mmBufferSize)); 2764 mmdata->mmBufferSize = mmBufferSize; 2765 } 2766 2767 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // the _preprocess was added in 11.2.1, but PETSc worked without it until 12.4.0 2768 PetscCallCUSPARSE(cusparseSpMM_preprocess(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer)); 2769 #endif 2770 2771 mmdata->initialized = PETSC_TRUE; 2772 } else { 2773 /* to be safe, always update pointers of the mats */ 2774 PetscCallCUSPARSE(cusparseSpMatSetValues(matADescr, csrmat->values->data().get())); 2775 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matBDescr, (void *)barray)); 2776 PetscCallCUSPARSE(cusparseDnMatSetValues(mmdata->matCDescr, (void *)carray)); 2777 } 2778 2779 /* do cusparseSpMM, which supports transpose on B */ 2780 PetscCallCUSPARSE(cusparseSpMM(cusp->handle, opA, opB, mat->alpha_one, matADescr, mmdata->matBDescr, mat->beta_zero, mmdata->matCDescr, cusparse_scalartype, cusp->spmmAlg, mmdata->mmBuffer)); 2781 #else 2782 PetscInt k; 2783 /* cusparseXcsrmm does 
not support transpose on B */ 2784 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) { 2785 cublasHandle_t cublasv2handle; 2786 cublasStatus_t cerr; 2787 2788 PetscCall(PetscCUBLASGetHandle(&cublasv2handle)); 2789 cerr = cublasXgeam(cublasv2handle, CUBLAS_OP_T, CUBLAS_OP_T, B->cmap->n, B->rmap->n, &PETSC_CUSPARSE_ONE, barray, blda, &PETSC_CUSPARSE_ZERO, barray, blda, mmdata->Bt, B->cmap->n); 2790 PetscCallCUBLAS(cerr); 2791 blda = B->cmap->n; 2792 k = B->cmap->n; 2793 } else { 2794 k = B->rmap->n; 2795 } 2796 2797 /* perform the MatMat operation, op(A) is m x k, op(B) is k x n */ 2798 stat = cusparse_csr_spmm(cusp->handle, opA, m, n, k, csrmat->num_entries, mat->alpha_one, mat->descr, csrmat->values->data().get(), csrmat->row_offsets->data().get(), csrmat->column_indices->data().get(), mmdata->Bt ? mmdata->Bt : barray, blda, mat->beta_zero, carray, clda); 2799 PetscCallCUSPARSE(stat); 2800 #endif 2801 PetscCall(PetscLogGpuTimeEnd()); 2802 PetscCall(PetscLogGpuFlops(n * 2.0 * csrmat->num_entries)); 2803 PetscCall(MatDenseRestoreArrayReadAndMemType(B, &barray)); 2804 if (product->type == MATPRODUCT_RARt) { 2805 PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray)); 2806 PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_FALSE, PETSC_FALSE)); 2807 } else if (product->type == MATPRODUCT_PtAP) { 2808 PetscCall(MatDenseRestoreArrayWriteAndMemType(mmdata->X, &carray)); 2809 PetscCall(MatMatMultNumeric_SeqDenseCUDA_SeqDenseCUDA_Internal(B, mmdata->X, C, PETSC_TRUE, PETSC_FALSE)); 2810 } else { 2811 PetscCall(MatDenseRestoreArrayWriteAndMemType(C, &carray)); 2812 } 2813 if (mmdata->cisdense) PetscCall(MatConvert(C, MATSEQDENSE, MAT_INPLACE_MATRIX, &C)); 2814 if (!biscuda) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B)); 2815 PetscFunctionReturn(PETSC_SUCCESS); 2816 } 2817 2818 static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA(Mat C) 2819 { 2820 Mat_Product *product = 
C->product; 2821 Mat A, B; 2822 PetscInt m, n; 2823 PetscBool cisdense, flg; 2824 MatProductCtx_MatMatCusparse *mmdata; 2825 Mat_SeqAIJCUSPARSE *cusp; 2826 2827 PetscFunctionBegin; 2828 MatCheckProduct(C, 1); 2829 PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty"); 2830 A = product->A; 2831 B = product->B; 2832 PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg)); 2833 PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name); 2834 cusp = (Mat_SeqAIJCUSPARSE *)A->spptr; 2835 PetscCheck(cusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 2836 switch (product->type) { 2837 case MATPRODUCT_AB: 2838 m = A->rmap->n; 2839 n = B->cmap->n; 2840 PetscCall(MatSetBlockSizesFromMats(C, A, B)); 2841 break; 2842 case MATPRODUCT_AtB: 2843 m = A->cmap->n; 2844 n = B->cmap->n; 2845 if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 2846 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 2847 break; 2848 case MATPRODUCT_ABt: 2849 m = A->rmap->n; 2850 n = B->rmap->n; 2851 if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 2852 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 2853 break; 2854 case MATPRODUCT_PtAP: 2855 m = B->cmap->n; 2856 n = B->cmap->n; 2857 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->cmap->bs)); 2858 if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 2859 break; 2860 case MATPRODUCT_RARt: 2861 m = B->rmap->n; 2862 n = B->rmap->n; 2863 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, B->rmap->bs)); 2864 if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 2865 break; 2866 default: 2867 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", 
MatProductTypes[product->type]); 2868 } 2869 PetscCall(MatSetSizes(C, m, n, m, n)); 2870 /* if C is of type MATSEQDENSE (CPU), perform the operation on the GPU and then copy on the CPU */ 2871 PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQDENSE, &cisdense)); 2872 PetscCall(MatSetType(C, MATSEQDENSECUDA)); 2873 2874 /* product data */ 2875 PetscCall(PetscNew(&mmdata)); 2876 mmdata->cisdense = cisdense; 2877 #if PETSC_PKG_CUDA_VERSION_LT(11, 0, 0) 2878 /* cusparseXcsrmm does not support transpose on B, so we allocate buffer to store B^T */ 2879 if (product->type == MATPRODUCT_ABt || product->type == MATPRODUCT_RARt) PetscCallCUDA(cudaMalloc((void **)&mmdata->Bt, (size_t)B->rmap->n * (size_t)B->cmap->n * sizeof(PetscScalar))); 2880 #endif 2881 /* for these products we need intermediate storage */ 2882 if (product->type == MATPRODUCT_RARt || product->type == MATPRODUCT_PtAP) { 2883 PetscCall(MatCreate(PetscObjectComm((PetscObject)C), &mmdata->X)); 2884 PetscCall(MatSetType(mmdata->X, MATSEQDENSECUDA)); 2885 if (product->type == MATPRODUCT_RARt) { /* do not preallocate, since the first call to MatDenseCUDAGetArray will preallocate on the GPU for us */ 2886 PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->rmap->n, A->rmap->n, B->rmap->n)); 2887 } else { 2888 PetscCall(MatSetSizes(mmdata->X, A->rmap->n, B->cmap->n, A->rmap->n, B->cmap->n)); 2889 } 2890 } 2891 C->product->data = mmdata; 2892 C->product->destroy = MatProductCtxDestroy_MatMatCusparse; 2893 2894 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqDENSECUDA; 2895 PetscFunctionReturn(PETSC_SUCCESS); 2896 } 2897 2898 static PetscErrorCode MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C) 2899 { 2900 Mat_Product *product = C->product; 2901 Mat A, B; 2902 Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp; 2903 Mat_SeqAIJ *c = (Mat_SeqAIJ *)C->data; 2904 Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat; 2905 CsrMatrix *Acsr, *Bcsr, *Ccsr; 2906 PetscBool flg; 2907 cusparseStatus_t stat; 2908 
MatProductType ptype;
  MatProductCtx_MatMatCusparse *mmdata;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  cusparseSpMatDescr_t BmatSpDescr;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  /* Numeric phase of the sparse-sparse product: recomputes the values of C on the GPU from A and B using the
     structures prepared by the symbolic phase, then performs a reduced form of assembly-end bookkeeping */
  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data empty");
  PetscCall(PetscObjectTypeCompare((PetscObject)C, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for C of type %s", ((PetscObject)C)->type_name);
  mmdata = (MatProductCtx_MatMatCusparse *)C->product->data;
  A = product->A;
  B = product->B;
  if (mmdata->reusesym) { /* this happens when api_user is true, meaning that the matrix values have been already computed in the MatProductSymbolic phase */
    /* one-shot flag: only validate that C's device structures exist, then skip the computation */
    mmdata->reusesym = PETSC_FALSE;
    Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
    PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
    Cmat = Ccusp->mat;
    PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[C->product->type]);
    Ccsr = (CsrMatrix *)Cmat->mat;
    PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
    goto finalize;
  }
  /* an empty product has nothing to compute */
  if (!c->nz) goto finalize;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  PetscCheck(!A->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  PetscCheck(!B->boundtocpu, PetscObjectComm((PetscObject)C), PETSC_ERR_ARG_WRONG, "Cannot bind to CPU a CUSPARSE matrix between MatProductSymbolic and MatProductNumeric phases");
  Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr;
  Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr;
  PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCheck(Ccusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(B));

  /* a transposed symmetric operand is treated as a plain AB product, but only if the symbolic phase recorded
     that it made the same choice */
  ptype = product->type;
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_A_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that A is symmetric");
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    PetscCheck(product->symbolic_used_the_fact_B_is_symmetric, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Symbolic should have been built using the fact that B is symmetric");
  }
  /* transposes are taken from the explicitly stored transpose structs, since opA and opB are fixed to NON_TRANSPOSE */
  switch (ptype) {
  case MATPRODUCT_AB:
    Amat = Acusp->mat;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_AtB:
    Amat = Acusp->matTranspose;
    Bmat = Bcusp->mat;
    break;
  case MATPRODUCT_ABt:
    Amat = Acusp->mat;
    Bmat = Bcusp->matTranspose;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  Cmat = Ccusp->mat;
  PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]);
  PetscCheck(Cmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C mult struct for product type %s", MatProductTypes[ptype]);
  Acsr = (CsrMatrix *)Amat->mat;
  Bcsr = mmdata->Bcsr ? mmdata->Bcsr : (CsrMatrix *)Bmat->mat; /* B may be in compressed row storage */
  Ccsr = (CsrMatrix *)Cmat->mat;
  PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct");
  PetscCheck(Bcsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct");
  PetscCheck(Ccsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing C CSR struct");
  PetscCall(PetscLogGpuTimeBegin());
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  BmatSpDescr = mmdata->Bcsr ? mmdata->matSpBDescr : Bmat->matDescr; /* B may be in compressed row storage */
  PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE));
  #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0)
  /* CUDA >= 11.4: a single SpGEMMreuse compute call using the descriptor kept in the product context */
  stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #else
  /* CUDA 11.0 - 11.3: SpGEMM compute followed by a copy of the result into C's descriptor */
  stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer);
  PetscCallCUSPARSE(stat);
  stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc);
  PetscCallCUSPARSE(stat);
  #endif
#else
  /* CUDA < 11: legacy csrgemm interface working directly on the raw CSR arrays */
  stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries,
                             Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get());
  PetscCallCUSPARSE(stat);
#endif
  PetscCall(PetscLogGpuFlops(mmdata->flops));
  PetscCallCUDA(WaitForCUDA());
  PetscCall(PetscLogGpuTimeEnd());
  /* the freshly computed values exist only on the device */
  C->offloadmask = PETSC_OFFLOAD_GPU;
finalize:
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs += 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Symbolic phase of the sparse-sparse product C = op(A) op(B) for two MATSEQAIJCUSPARSE matrices.
   It allocates the product context and attaches it, together with its destructor, to C->product. */
static PetscErrorCode MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE(Mat C)
{
  Mat_Product *product = C->product;
  Mat A, B;
  Mat_SeqAIJCUSPARSE *Acusp, *Bcusp, *Ccusp;
  Mat_SeqAIJ *a, *b, *c;
  Mat_SeqAIJCUSPARSEMultStruct *Amat, *Bmat, *Cmat;
  CsrMatrix *Acsr, *Bcsr, *Ccsr;
  PetscInt i, j, m, n, k;
  PetscBool flg;
  cusparseStatus_t stat;
  MatProductType ptype;
  MatProductCtx_MatMatCusparse *mmdata;
  PetscLogDouble flops;
  PetscBool biscompressed, ciscompressed;
#if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0)
  int64_t C_num_rows1, C_num_cols1, C_nnz1;
  cusparseSpMatDescr_t BmatSpDescr;
#else
  int cnz;
#endif
  cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE, opB = CUSPARSE_OPERATION_NON_TRANSPOSE; /* cuSPARSE spgemm doesn't support transpose yet */

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(!C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(PetscObjectTypeCompare((PetscObject)A, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for type %s", ((PetscObject)A)->type_name);
  PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQAIJCUSPARSE, &flg));
  PetscCheck(flg, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Not for B of type %s", ((PetscObject)B)->type_name);
  a = (Mat_SeqAIJ *)A->data;
  b = (Mat_SeqAIJ *)B->data;
  /* product data */
  PetscCall(PetscNew(&mmdata));
  C->product->data = mmdata;
  C->product->destroy = MatProductCtxDestroy_MatMatCusparse;

PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3059 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 3060 Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr; /* Access spptr after MatSeqAIJCUSPARSECopyToGPU, not before */ 3061 Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr; 3062 PetscCheck(Acusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 3063 PetscCheck(Bcusp->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Only for MAT_CUSPARSE_CSR format"); 3064 3065 ptype = product->type; 3066 if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 3067 ptype = MATPRODUCT_AB; 3068 product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 3069 } 3070 if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 3071 ptype = MATPRODUCT_AB; 3072 product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 3073 } 3074 biscompressed = PETSC_FALSE; 3075 ciscompressed = PETSC_FALSE; 3076 switch (ptype) { 3077 case MATPRODUCT_AB: 3078 m = A->rmap->n; 3079 n = B->cmap->n; 3080 k = A->cmap->n; 3081 Amat = Acusp->mat; 3082 Bmat = Bcusp->mat; 3083 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3084 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3085 break; 3086 case MATPRODUCT_AtB: 3087 m = A->cmap->n; 3088 n = B->cmap->n; 3089 k = A->rmap->n; 3090 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3091 Amat = Acusp->matTranspose; 3092 Bmat = Bcusp->mat; 3093 if (b->compressedrow.use) biscompressed = PETSC_TRUE; 3094 break; 3095 case MATPRODUCT_ABt: 3096 m = A->rmap->n; 3097 n = B->rmap->n; 3098 k = A->cmap->n; 3099 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 3100 Amat = Acusp->mat; 3101 Bmat = Bcusp->matTranspose; 3102 if (a->compressedrow.use) ciscompressed = PETSC_TRUE; 3103 break; 3104 default: 3105 SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Unsupported product type %s", MatProductTypes[product->type]); 3106 } 3107 3108 /* create cusparse matrix */ 3109 
PetscCall(MatSetSizes(C, m, n, m, n)); 3110 PetscCall(MatSetType(C, MATSEQAIJCUSPARSE)); 3111 c = (Mat_SeqAIJ *)C->data; 3112 Ccusp = (Mat_SeqAIJCUSPARSE *)C->spptr; 3113 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 3114 Ccsr = new CsrMatrix; 3115 3116 c->compressedrow.use = ciscompressed; 3117 if (c->compressedrow.use) { /* if a is in compressed row, than c will be in compressed row format */ 3118 c->compressedrow.nrows = a->compressedrow.nrows; 3119 PetscCall(PetscMalloc2(c->compressedrow.nrows + 1, &c->compressedrow.i, c->compressedrow.nrows, &c->compressedrow.rindex)); 3120 PetscCall(PetscArraycpy(c->compressedrow.rindex, a->compressedrow.rindex, c->compressedrow.nrows)); 3121 Ccusp->workVector = new THRUSTARRAY(c->compressedrow.nrows); 3122 Cmat->cprowIndices = new THRUSTINTARRAY(c->compressedrow.nrows); 3123 Cmat->cprowIndices->assign(c->compressedrow.rindex, c->compressedrow.rindex + c->compressedrow.nrows); 3124 } else { 3125 c->compressedrow.nrows = 0; 3126 c->compressedrow.i = NULL; 3127 c->compressedrow.rindex = NULL; 3128 Ccusp->workVector = NULL; 3129 Cmat->cprowIndices = NULL; 3130 } 3131 Ccusp->nrows = ciscompressed ? 
c->compressedrow.nrows : m; 3132 Ccusp->mat = Cmat; 3133 Ccusp->mat->mat = Ccsr; 3134 Ccsr->num_rows = Ccusp->nrows; 3135 Ccsr->num_cols = n; 3136 Ccsr->row_offsets = new THRUSTINTARRAY32(Ccusp->nrows + 1); 3137 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 3138 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 3139 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 3140 PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar))); 3141 PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar))); 3142 PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar))); 3143 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3144 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3145 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 3146 if (!Ccsr->num_rows || !Ccsr->num_cols || !a->nz || !b->nz) { /* cusparse raise errors in different calls when matrices have zero rows/columns! 
*/ 3147 PetscCallThrust(thrust::fill(thrust::device, Ccsr->row_offsets->begin(), Ccsr->row_offsets->end(), 0)); 3148 c->nz = 0; 3149 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3150 Ccsr->values = new THRUSTARRAY(c->nz); 3151 goto finalizesym; 3152 } 3153 3154 PetscCheck(Amat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A mult struct for product type %s", MatProductTypes[ptype]); 3155 PetscCheck(Bmat, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B mult struct for product type %s", MatProductTypes[ptype]); 3156 Acsr = (CsrMatrix *)Amat->mat; 3157 if (!biscompressed) { 3158 Bcsr = (CsrMatrix *)Bmat->mat; 3159 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3160 BmatSpDescr = Bmat->matDescr; 3161 #endif 3162 } else { /* we need to use row offsets for the full matrix */ 3163 CsrMatrix *cBcsr = (CsrMatrix *)Bmat->mat; 3164 Bcsr = new CsrMatrix; 3165 Bcsr->num_rows = B->rmap->n; 3166 Bcsr->num_cols = cBcsr->num_cols; 3167 Bcsr->num_entries = cBcsr->num_entries; 3168 Bcsr->column_indices = cBcsr->column_indices; 3169 Bcsr->values = cBcsr->values; 3170 if (!Bcusp->rowoffsets_gpu) { 3171 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 3172 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 3173 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 3174 } 3175 Bcsr->row_offsets = Bcusp->rowoffsets_gpu; 3176 mmdata->Bcsr = Bcsr; 3177 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3178 if (Bcsr->num_rows && Bcsr->num_cols) { 3179 stat = cusparseCreateCsr(&mmdata->matSpBDescr, Bcsr->num_rows, Bcsr->num_cols, Bcsr->num_entries, Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Bcsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 3180 PetscCallCUSPARSE(stat); 3181 } 3182 BmatSpDescr = mmdata->matSpBDescr; 3183 #endif 3184 } 3185 PetscCheck(Acsr, PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing A CSR struct"); 3186 PetscCheck(Bcsr, 
PetscObjectComm((PetscObject)C), PETSC_ERR_GPU, "Missing B CSR struct"); 3187 /* precompute flops count */ 3188 if (ptype == MATPRODUCT_AB) { 3189 for (i = 0, flops = 0; i < A->rmap->n; i++) { 3190 const PetscInt st = a->i[i]; 3191 const PetscInt en = a->i[i + 1]; 3192 for (j = st; j < en; j++) { 3193 const PetscInt brow = a->j[j]; 3194 flops += 2. * (b->i[brow + 1] - b->i[brow]); 3195 } 3196 } 3197 } else if (ptype == MATPRODUCT_AtB) { 3198 for (i = 0, flops = 0; i < A->rmap->n; i++) { 3199 const PetscInt anzi = a->i[i + 1] - a->i[i]; 3200 const PetscInt bnzi = b->i[i + 1] - b->i[i]; 3201 flops += (2. * anzi) * bnzi; 3202 } 3203 } else { /* TODO */ 3204 flops = 0.; 3205 } 3206 3207 mmdata->flops = flops; 3208 PetscCall(PetscLogGpuTimeBegin()); 3209 3210 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3211 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3212 // cuda-12.2 requires non-null csrRowOffsets 3213 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, 0, Ccsr->row_offsets->data().get(), NULL, NULL, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 3214 PetscCallCUSPARSE(stat); 3215 PetscCallCUSPARSE(cusparseSpGEMM_createDescr(&mmdata->spgemmDesc)); 3216 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 3217 { 3218 /* cusparseSpGEMMreuse has more reasonable APIs than cusparseSpGEMM, so we prefer to use it. 
3219 We follow the sample code at https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuSPARSE/spgemm_reuse 3220 */ 3221 void *dBuffer1 = NULL; 3222 void *dBuffer2 = NULL; 3223 void *dBuffer3 = NULL; 3224 /* dBuffer4, dBuffer5 are needed by cusparseSpGEMMreuse_compute, and therefore are stored in mmdata */ 3225 size_t bufferSize1 = 0; 3226 size_t bufferSize2 = 0; 3227 size_t bufferSize3 = 0; 3228 size_t bufferSize4 = 0; 3229 size_t bufferSize5 = 0; 3230 3231 /* ask bufferSize1 bytes for external memory */ 3232 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, NULL); 3233 PetscCallCUSPARSE(stat); 3234 PetscCallCUDA(cudaMalloc((void **)&dBuffer1, bufferSize1)); 3235 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3236 stat = cusparseSpGEMMreuse_workEstimation(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize1, dBuffer1); 3237 PetscCallCUSPARSE(stat); 3238 3239 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, NULL, &bufferSize3, NULL, &bufferSize4, NULL); 3240 PetscCallCUSPARSE(stat); 3241 PetscCallCUDA(cudaMalloc((void **)&dBuffer2, bufferSize2)); 3242 PetscCallCUDA(cudaMalloc((void **)&dBuffer3, bufferSize3)); 3243 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer4, bufferSize4)); 3244 stat = cusparseSpGEMMreuse_nnz(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize2, dBuffer2, &bufferSize3, dBuffer3, &bufferSize4, mmdata->dBuffer4); 3245 PetscCallCUSPARSE(stat); 3246 PetscCallCUDA(cudaFree(dBuffer1)); 3247 PetscCallCUDA(cudaFree(dBuffer2)); 3248 3249 /* get matrix C non-zero entries C_nnz1 */ 3250 
PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3251 c->nz = (PetscInt)C_nnz1; 3252 /* allocate matrix C */ 3253 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3254 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3255 Ccsr->values = new THRUSTARRAY(c->nz); 3256 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3257 /* update matC with the new pointers */ 3258 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3259 PetscCallCUSPARSE(stat); 3260 3261 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, NULL); 3262 PetscCallCUSPARSE(stat); 3263 PetscCallCUDA(cudaMalloc((void **)&mmdata->dBuffer5, bufferSize5)); 3264 stat = cusparseSpGEMMreuse_copy(Ccusp->handle, opA, opB, Amat->matDescr, BmatSpDescr, Cmat->matDescr, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufferSize5, mmdata->dBuffer5); 3265 PetscCallCUSPARSE(stat); 3266 PetscCallCUDA(cudaFree(dBuffer3)); 3267 stat = cusparseSpGEMMreuse_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3268 PetscCallCUSPARSE(stat); 3269 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufferSize4 / 1024, bufferSize5 / 1024)); 3270 } 3271 #else 3272 size_t bufSize2; 3273 /* ask bufferSize bytes for external memory */ 3274 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, 
mmdata->spgemmDesc, &bufSize2, NULL); 3275 PetscCallCUSPARSE(stat); 3276 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer2, bufSize2)); 3277 /* inspect the matrices A and B to understand the memory requirement for the next step */ 3278 stat = cusparseSpGEMM_workEstimation(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &bufSize2, mmdata->mmBuffer2); 3279 PetscCallCUSPARSE(stat); 3280 /* ask bufferSize again bytes for external memory */ 3281 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, NULL); 3282 PetscCallCUSPARSE(stat); 3283 /* The CUSPARSE documentation is not clear, nor the API 3284 We need both buffers to perform the operations properly! 3285 mmdata->mmBuffer2 does not appear anywhere in the compute/copy API 3286 it only appears for the workEstimation stuff, but it seems it is needed in compute, so probably the address 3287 is stored in the descriptor! What a messy API... 
*/ 3288 PetscCallCUDA(cudaMalloc((void **)&mmdata->mmBuffer, mmdata->mmBufferSize)); 3289 /* compute the intermediate product of A * B */ 3290 stat = cusparseSpGEMM_compute(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc, &mmdata->mmBufferSize, mmdata->mmBuffer); 3291 PetscCallCUSPARSE(stat); 3292 /* get matrix C non-zero entries C_nnz1 */ 3293 PetscCallCUSPARSE(cusparseSpMatGetSize(Cmat->matDescr, &C_num_rows1, &C_num_cols1, &C_nnz1)); 3294 c->nz = (PetscInt)C_nnz1; 3295 PetscCall(PetscInfo(C, "Buffer sizes for type %s, result %" PetscInt_FMT " x %" PetscInt_FMT " (k %" PetscInt_FMT ", nzA %" PetscInt_FMT ", nzB %" PetscInt_FMT ", nzC %" PetscInt_FMT ") are: %ldKB %ldKB\n", MatProductTypes[ptype], m, n, k, a->nz, b->nz, c->nz, bufSize2 / 1024, 3296 mmdata->mmBufferSize / 1024)); 3297 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3298 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3299 Ccsr->values = new THRUSTARRAY(c->nz); 3300 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3301 stat = cusparseCsrSetPointers(Cmat->matDescr, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get()); 3302 PetscCallCUSPARSE(stat); 3303 stat = cusparseSpGEMM_copy(Ccusp->handle, opA, opB, Cmat->alpha_one, Amat->matDescr, BmatSpDescr, Cmat->beta_zero, Cmat->matDescr, cusparse_scalartype, CUSPARSE_SPGEMM_DEFAULT, mmdata->spgemmDesc); 3304 PetscCallCUSPARSE(stat); 3305 #endif // PETSC_PKG_CUDA_VERSION_GE(11,4,0) 3306 #else 3307 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_HOST)); 3308 stat = cusparseXcsrgemmNnz(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3309 
Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->row_offsets->data().get(), &cnz); 3310 PetscCallCUSPARSE(stat); 3311 c->nz = cnz; 3312 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 3313 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3314 Ccsr->values = new THRUSTARRAY(c->nz); 3315 PetscCallCUDA(cudaPeekAtLastError()); /* catch out of memory errors */ 3316 3317 PetscCallCUSPARSE(cusparseSetPointerMode(Ccusp->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3318 /* with the old gemm interface (removed from 11.0 on) we cannot compute the symbolic factorization only. 3319 I have tried using the gemm2 interface (alpha * A * B + beta * D), which allows to do symbolic by passing NULL for values, but it seems quite buggy when 3320 D is NULL, despite the fact that CUSPARSE documentation claims it is supported! */ 3321 stat = cusparse_csr_spgemm(Ccusp->handle, opA, opB, Acsr->num_rows, Bcsr->num_cols, Acsr->num_cols, Amat->descr, Acsr->num_entries, Acsr->values->data().get(), Acsr->row_offsets->data().get(), Acsr->column_indices->data().get(), Bmat->descr, Bcsr->num_entries, 3322 Bcsr->values->data().get(), Bcsr->row_offsets->data().get(), Bcsr->column_indices->data().get(), Cmat->descr, Ccsr->values->data().get(), Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get()); 3323 PetscCallCUSPARSE(stat); 3324 #endif 3325 PetscCall(PetscLogGpuFlops(mmdata->flops)); 3326 PetscCall(PetscLogGpuTimeEnd()); 3327 finalizesym: 3328 c->free_a = PETSC_TRUE; 3329 PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j)); 3330 PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i)); 3331 c->free_ij = PETSC_TRUE; 3332 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */ 3333 PetscInt *d_i = c->i; 3334 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 3335 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 3336 ii = 
*Ccsr->row_offsets; 3337 jj = *Ccsr->column_indices; 3338 if (ciscompressed) d_i = c->compressedrow.i; 3339 PetscCallCUDA(cudaMemcpy(d_i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3340 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3341 } else { 3342 PetscInt *d_i = c->i; 3343 if (ciscompressed) d_i = c->compressedrow.i; 3344 PetscCallCUDA(cudaMemcpy(d_i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3345 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 3346 } 3347 if (ciscompressed) { /* need to expand host row offsets */ 3348 PetscInt r = 0; 3349 c->i[0] = 0; 3350 for (k = 0; k < c->compressedrow.nrows; k++) { 3351 const PetscInt next = c->compressedrow.rindex[k]; 3352 const PetscInt old = c->compressedrow.i[k]; 3353 for (; r < next; r++) c->i[r + 1] = old; 3354 } 3355 for (; r < m; r++) c->i[r + 1] = c->compressedrow.i[c->compressedrow.nrows]; 3356 } 3357 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 3358 PetscCall(PetscMalloc1(m, &c->ilen)); 3359 PetscCall(PetscMalloc1(m, &c->imax)); 3360 c->maxnz = c->nz; 3361 c->nonzerorowcnt = 0; 3362 c->rmax = 0; 3363 for (k = 0; k < m; k++) { 3364 const PetscInt nn = c->i[k + 1] - c->i[k]; 3365 c->ilen[k] = c->imax[k] = nn; 3366 c->nonzerorowcnt += (PetscInt)!!nn; 3367 c->rmax = PetscMax(c->rmax, nn); 3368 } 3369 PetscCall(MatMarkDiagonal_SeqAIJ(C)); 3370 PetscCall(PetscMalloc1(c->nz, &c->a)); 3371 Ccsr->num_entries = c->nz; 3372 3373 C->nonzerostate++; 3374 PetscCall(PetscLayoutSetUp(C->rmap)); 3375 PetscCall(PetscLayoutSetUp(C->cmap)); 3376 Ccusp->nonzerostate = C->nonzerostate; 3377 C->offloadmask = PETSC_OFFLOAD_UNALLOCATED; 3378 C->preallocated = PETSC_TRUE; 3379 C->assembled = 
PETSC_FALSE; 3380 C->was_assembled = PETSC_FALSE; 3381 if (product->api_user && A->offloadmask == PETSC_OFFLOAD_BOTH && B->offloadmask == PETSC_OFFLOAD_BOTH) { /* flag the matrix C values as computed, so that the numeric phase will only call MatAssembly */ 3382 mmdata->reusesym = PETSC_TRUE; 3383 C->offloadmask = PETSC_OFFLOAD_GPU; 3384 } 3385 C->ops->productnumeric = MatProductNumeric_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3386 PetscFunctionReturn(PETSC_SUCCESS); 3387 } 3388 3389 PETSC_INTERN PetscErrorCode MatProductSetFromOptions_SeqAIJ_SeqDense(Mat); 3390 3391 /* handles sparse or dense B */ 3392 static PetscErrorCode MatProductSetFromOptions_SeqAIJCUSPARSE(Mat mat) 3393 { 3394 Mat_Product *product = mat->product; 3395 PetscBool isdense = PETSC_FALSE, Biscusp = PETSC_FALSE, Ciscusp = PETSC_TRUE; 3396 3397 PetscFunctionBegin; 3398 MatCheckProduct(mat, 1); 3399 PetscCall(PetscObjectBaseTypeCompare((PetscObject)product->B, MATSEQDENSE, &isdense)); 3400 if (!product->A->boundtocpu && !product->B->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJCUSPARSE, &Biscusp)); 3401 if (product->type == MATPRODUCT_ABC) { 3402 Ciscusp = PETSC_FALSE; 3403 if (!product->C->boundtocpu) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJCUSPARSE, &Ciscusp)); 3404 } 3405 if (Biscusp && Ciscusp) { /* we can always select the CPU backend */ 3406 PetscBool usecpu = PETSC_FALSE; 3407 switch (product->type) { 3408 case MATPRODUCT_AB: 3409 if (product->api_user) { 3410 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatMatMult", "Mat"); 3411 PetscCall(PetscOptionsBool("-matmatmult_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 3412 PetscOptionsEnd(); 3413 } else { 3414 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AB", "Mat"); 3415 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMult", usecpu, &usecpu, NULL)); 
3416 PetscOptionsEnd(); 3417 } 3418 break; 3419 case MATPRODUCT_AtB: 3420 if (product->api_user) { 3421 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatTransposeMatMult", "Mat"); 3422 PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3423 PetscOptionsEnd(); 3424 } else { 3425 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_AtB", "Mat"); 3426 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatTransposeMatMult", usecpu, &usecpu, NULL)); 3427 PetscOptionsEnd(); 3428 } 3429 break; 3430 case MATPRODUCT_PtAP: 3431 if (product->api_user) { 3432 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatPtAP", "Mat"); 3433 PetscCall(PetscOptionsBool("-matptap_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3434 PetscOptionsEnd(); 3435 } else { 3436 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_PtAP", "Mat"); 3437 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatPtAP", usecpu, &usecpu, NULL)); 3438 PetscOptionsEnd(); 3439 } 3440 break; 3441 case MATPRODUCT_RARt: 3442 if (product->api_user) { 3443 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatRARt", "Mat"); 3444 PetscCall(PetscOptionsBool("-matrart_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3445 PetscOptionsEnd(); 3446 } else { 3447 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_RARt", "Mat"); 3448 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatRARt", usecpu, &usecpu, NULL)); 3449 PetscOptionsEnd(); 3450 } 3451 break; 3452 case MATPRODUCT_ABC: 3453 if (product->api_user) { 3454 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, 
"MatMatMatMult", "Mat"); 3455 PetscCall(PetscOptionsBool("-matmatmatmult_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3456 PetscOptionsEnd(); 3457 } else { 3458 PetscOptionsBegin(PetscObjectComm((PetscObject)mat), ((PetscObject)mat)->prefix, "MatProduct_ABC", "Mat"); 3459 PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu", "Use CPU code", "MatMatMatMult", usecpu, &usecpu, NULL)); 3460 PetscOptionsEnd(); 3461 } 3462 break; 3463 default: 3464 break; 3465 } 3466 if (usecpu) Biscusp = Ciscusp = PETSC_FALSE; 3467 } 3468 /* dispatch */ 3469 if (isdense) { 3470 switch (product->type) { 3471 case MATPRODUCT_AB: 3472 case MATPRODUCT_AtB: 3473 case MATPRODUCT_ABt: 3474 case MATPRODUCT_PtAP: 3475 case MATPRODUCT_RARt: 3476 if (product->A->boundtocpu) { 3477 PetscCall(MatProductSetFromOptions_SeqAIJ_SeqDense(mat)); 3478 } else { 3479 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqDENSECUDA; 3480 } 3481 break; 3482 case MATPRODUCT_ABC: 3483 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3484 break; 3485 default: 3486 break; 3487 } 3488 } else if (Biscusp && Ciscusp) { 3489 switch (product->type) { 3490 case MATPRODUCT_AB: 3491 case MATPRODUCT_AtB: 3492 case MATPRODUCT_ABt: 3493 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJCUSPARSE_SeqAIJCUSPARSE; 3494 break; 3495 case MATPRODUCT_PtAP: 3496 case MATPRODUCT_RARt: 3497 case MATPRODUCT_ABC: 3498 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 3499 break; 3500 default: 3501 break; 3502 } 3503 } else { /* fallback for AIJ */ 3504 PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 3505 } 3506 PetscFunctionReturn(PETSC_SUCCESS); 3507 } 3508 3509 static PetscErrorCode MatMult_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3510 { 3511 PetscFunctionBegin; 3512 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_FALSE, PETSC_FALSE)); 3513 PetscFunctionReturn(PETSC_SUCCESS); 3514 } 3515 3516 static PetscErrorCode MatMultAdd_SeqAIJCUSPARSE(Mat A, Vec xx, 
Vec yy, Vec zz) 3517 { 3518 PetscFunctionBegin; 3519 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_FALSE, PETSC_FALSE)); 3520 PetscFunctionReturn(PETSC_SUCCESS); 3521 } 3522 3523 static PetscErrorCode MatMultHermitianTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3524 { 3525 PetscFunctionBegin; 3526 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_TRUE)); 3527 PetscFunctionReturn(PETSC_SUCCESS); 3528 } 3529 3530 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) 3531 { 3532 PetscFunctionBegin; 3533 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, yy, zz, PETSC_TRUE, PETSC_TRUE)); 3534 PetscFunctionReturn(PETSC_SUCCESS); 3535 } 3536 3537 static PetscErrorCode MatMultTranspose_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy) 3538 { 3539 PetscFunctionBegin; 3540 PetscCall(MatMultAddKernel_SeqAIJCUSPARSE(A, xx, NULL, yy, PETSC_TRUE, PETSC_FALSE)); 3541 PetscFunctionReturn(PETSC_SUCCESS); 3542 } 3543 3544 __global__ static void ScatterAdd(PetscInt n, PetscInt *idx, const PetscScalar *x, PetscScalar *y) 3545 { 3546 int i = blockIdx.x * blockDim.x + threadIdx.x; 3547 if (i < n) y[idx[i]] += x[i]; 3548 } 3549 3550 /* z = op(A) x + y. 
If trans & !herm, op = ^T; if trans & herm, op = ^H; if !trans, op = no-op */ 3551 static PetscErrorCode MatMultAddKernel_SeqAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz, PetscBool trans, PetscBool herm) 3552 { 3553 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3554 Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr; 3555 Mat_SeqAIJCUSPARSEMultStruct *matstruct; 3556 PetscScalar *xarray, *zarray, *dptr, *beta, *xptr; 3557 cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE; 3558 PetscBool compressed; 3559 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3560 PetscInt nx, ny; 3561 #endif 3562 3563 PetscFunctionBegin; 3564 PetscCheck(!herm || trans, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "Hermitian and not transpose not supported"); 3565 if (!a->nz) { 3566 if (yy) PetscCall(VecSeq_CUDA::Copy(yy, zz)); 3567 else PetscCall(VecSeq_CUDA::Set(zz, 0)); 3568 PetscFunctionReturn(PETSC_SUCCESS); 3569 } 3570 /* The line below is necessary due to the operations that modify the matrix on the CPU (axpy, scale, etc) */ 3571 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 3572 if (!trans) { 3573 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3574 PetscCheck(matstruct, PetscObjectComm((PetscObject)A), PETSC_ERR_GPU, "SeqAIJCUSPARSE does not have a 'mat' (need to fix)"); 3575 } else { 3576 if (herm || !A->form_explicit_transpose) { 3577 opA = herm ? CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE : CUSPARSE_OPERATION_TRANSPOSE; 3578 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->mat; 3579 } else { 3580 if (!cusparsestruct->matTranspose) PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 3581 matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestruct->matTranspose; 3582 } 3583 } 3584 /* Does the matrix use compressed rows (i.e., drop zero rows)? */ 3585 compressed = matstruct->cprowIndices ? 
PETSC_TRUE : PETSC_FALSE; 3586 3587 try { 3588 PetscCall(VecCUDAGetArrayRead(xx, (const PetscScalar **)&xarray)); 3589 if (yy == zz) PetscCall(VecCUDAGetArray(zz, &zarray)); /* read & write zz, so need to get up-to-date zarray on GPU */ 3590 else PetscCall(VecCUDAGetArrayWrite(zz, &zarray)); /* write zz, so no need to init zarray on GPU */ 3591 3592 PetscCall(PetscLogGpuTimeBegin()); 3593 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3594 /* z = A x + beta y. 3595 If A is compressed (with less rows), then Ax is shorter than the full z, so we need a work vector to store Ax. 3596 When A is non-compressed, and z = y, we can set beta=1 to compute y = Ax + y in one call. 3597 */ 3598 xptr = xarray; 3599 dptr = compressed ? cusparsestruct->workVector->data().get() : zarray; 3600 beta = (yy == zz && !compressed) ? matstruct->beta_one : matstruct->beta_zero; 3601 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3602 /* Get length of x, y for y=Ax. ny might be shorter than the work vector's allocated length, since the work vector is 3603 allocated to accommodate different uses. So we get the length info directly from mat. 3604 */ 3605 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3606 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3607 nx = mat->num_cols; // since y = Ax 3608 ny = mat->num_rows; 3609 } 3610 #endif 3611 } else { 3612 /* z = A^T x + beta y 3613 If A is compressed, then we need a work vector as the shorter version of x to compute A^T x. 3614 Note A^Tx is of full length, so we set beta to 1.0 if y exists. 3615 */ 3616 xptr = compressed ? cusparsestruct->workVector->data().get() : xarray; 3617 dptr = zarray; 3618 beta = yy ? 
matstruct->beta_one : matstruct->beta_zero; 3619 if (compressed) { /* Scatter x to work vector */ 3620 thrust::device_ptr<PetscScalar> xarr = thrust::device_pointer_cast(xarray); 3621 3622 thrust::for_each( 3623 #if PetscDefined(HAVE_THRUST_ASYNC) 3624 thrust::cuda::par.on(PetscDefaultCudaStream), 3625 #endif 3626 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))), 3627 thrust::make_zip_iterator(thrust::make_tuple(cusparsestruct->workVector->begin(), thrust::make_permutation_iterator(xarr, matstruct->cprowIndices->begin()))) + matstruct->cprowIndices->size(), VecCUDAEqualsReverse()); 3628 } 3629 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3630 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3631 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3632 nx = mat->num_rows; // since y = A^T x 3633 ny = mat->num_cols; 3634 } 3635 #endif 3636 } 3637 3638 /* csr_spmv does y = alpha op(A) x + beta y */ 3639 if (cusparsestruct->format == MAT_CUSPARSE_CSR) { 3640 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3641 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 3642 cusparseSpMatDescr_t &matDescr = matstruct->matDescr_SpMV[opA]; // All opA's should use the same matDescr, but the cusparse issue/bug (#212) after 12.4 forced us to create a new one for each opA. 
3643 #else 3644 cusparseSpMatDescr_t &matDescr = matstruct->matDescr; 3645 #endif 3646 3647 PetscCheck(opA >= 0 && opA <= 2, PETSC_COMM_SELF, PETSC_ERR_SUP, "cuSPARSE ABI on cusparseOperation_t has changed and PETSc has not been updated accordingly"); 3648 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 3649 if (!matDescr) { 3650 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3651 PetscCallCUSPARSE(cusparseCreateCsr(&matDescr, mat->num_rows, mat->num_cols, mat->num_entries, mat->row_offsets->data().get(), mat->column_indices->data().get(), mat->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype)); 3652 } 3653 #endif 3654 3655 if (!matstruct->cuSpMV[opA].initialized) { /* built on demand */ 3656 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecXDescr, nx, xptr, cusparse_scalartype)); 3657 PetscCallCUSPARSE(cusparseCreateDnVec(&matstruct->cuSpMV[opA].vecYDescr, ny, dptr, cusparse_scalartype)); 3658 PetscCallCUSPARSE( 3659 cusparseSpMV_bufferSize(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, &matstruct->cuSpMV[opA].spmvBufferSize)); 3660 PetscCallCUDA(cudaMalloc(&matstruct->cuSpMV[opA].spmvBuffer, matstruct->cuSpMV[opA].spmvBufferSize)); 3661 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) // cusparseSpMV_preprocess is added in 12.4 3662 PetscCallCUSPARSE( 3663 cusparseSpMV_preprocess(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3664 #endif 3665 matstruct->cuSpMV[opA].initialized = PETSC_TRUE; 3666 } else { 3667 /* x, y's value pointers might change between calls, but their shape is kept, so we just update pointers */ 3668 PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecXDescr, xptr)); 3669 
PetscCallCUSPARSE(cusparseDnVecSetValues(matstruct->cuSpMV[opA].vecYDescr, dptr)); 3670 } 3671 3672 PetscCallCUSPARSE(cusparseSpMV(cusparsestruct->handle, opA, matstruct->alpha_one, matDescr, matstruct->cuSpMV[opA].vecXDescr, beta, matstruct->cuSpMV[opA].vecYDescr, cusparse_scalartype, cusparsestruct->spmvAlg, matstruct->cuSpMV[opA].spmvBuffer)); 3673 #else 3674 CsrMatrix *mat = (CsrMatrix *)matstruct->mat; 3675 PetscCallCUSPARSE(cusparse_csr_spmv(cusparsestruct->handle, opA, mat->num_rows, mat->num_cols, mat->num_entries, matstruct->alpha_one, matstruct->descr, mat->values->data().get(), mat->row_offsets->data().get(), mat->column_indices->data().get(), xptr, beta, dptr)); 3676 #endif 3677 } else { 3678 if (cusparsestruct->nrows) { 3679 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3680 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_CUSPARSE_ELL and MAT_CUSPARSE_HYB are not supported since CUDA-11.0"); 3681 #else 3682 cusparseHybMat_t hybMat = (cusparseHybMat_t)matstruct->mat; 3683 PetscCallCUSPARSE(cusparse_hyb_spmv(cusparsestruct->handle, opA, matstruct->alpha_one, matstruct->descr, hybMat, xptr, beta, dptr)); 3684 #endif 3685 } 3686 } 3687 PetscCall(PetscLogGpuTimeEnd()); 3688 3689 if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 3690 if (yy) { /* MatMultAdd: zz = A*xx + yy */ 3691 if (compressed) { /* A is compressed. We first copy yy to zz, then ScatterAdd the work vector to zz */ 3692 PetscCall(VecSeq_CUDA::Copy(yy, zz)); /* zz = yy */ 3693 } else if (zz != yy) { /* A is not compressed. zz already contains A*xx, and we just need to add yy */ 3694 PetscCall(VecSeq_CUDA::AXPY(zz, 1.0, yy)); /* zz += yy */ 3695 } 3696 } else if (compressed) { /* MatMult: zz = A*xx. 
/*
  GetDiagonal_CSR - CUDA kernel that pulls the diagonal out of a matrix stored in CSR form.

  Input Parameters:
+ row - row offsets; entries x and x+1 are read for every x < len
. col - column index of each stored entry
. val - value of each stored entry
- len - number of rows to process

  Output Parameter:
. diag - diag[x] receives the stored (x,x) entry of row x, or 0 when the row stores none

  Notes:
  Each thread handles exactly one row, identified by its flat 1D global thread index, so the
  launch must provide at least len threads; surplus threads return without touching memory.
*/
__global__ static void GetDiagonal_CSR(const int *row, const int *col, const PetscScalar *val, const PetscInt len, PetscScalar *diag)
{
  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;

  /* threads beyond the last row have nothing to do */
  if (!(tid < len)) return;

  const PetscInt first = row[tid];
  const PetscInt count = row[tid + 1] - first;
  PetscScalar    entry = 0.0; /* reported when the row has no stored diagonal */

  /* linear scan of the row, stopping at the first entry whose column equals the row index */
  for (PetscInt k = 0; k < count; k++) {
    if (col[first + k] == tid) {
      entry = val[first + k];
      break;
    }
  }
  diag[tid] = entry;
}
PetscErrorCode MatCreateSeqAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  Mat created;

  PetscFunctionBegin;
  /* the new matrix is written straight into the caller's handle */
  PetscCall(MatCreate(comm, A));
  created = *A;

  /* m and n are passed for both size pairs of MatSetSizes() */
  PetscCall(MatSetSizes(created, m, n, m, n));
  PetscCall(MatSetType(created, MATSEQAIJCUSPARSE));

  /* preallocation is delegated to the SeqAIJ routine */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(created, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  MatDuplicate_SeqAIJCUSPARSE - Duplicates A into *B.

  The copy is first produced by the SeqAIJ duplicate routine and is then converted in place
  to MATSEQAIJCUSPARSE.
*/
static PetscErrorCode MatDuplicate_SeqAIJCUSPARSE(Mat A, MatDuplicateOption cpvalues, Mat *B)
{
  Mat dup;

  PetscFunctionBegin;
  PetscCall(MatDuplicate_SeqAIJ(A, cpvalues, B));
  dup = *B;
  /* in-place conversion: the duplicate is both the input and the output of the convert call */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(dup, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, B));
  PetscFunctionReturn(PETSC_SUCCESS);
}
Mat_SeqAIJCUSPARSE *cx; 3867 PetscScalar *ay; 3868 const PetscScalar *ax; 3869 CsrMatrix *csry, *csrx; 3870 3871 PetscFunctionBegin; 3872 cy = (Mat_SeqAIJCUSPARSE *)Y->spptr; 3873 cx = (Mat_SeqAIJCUSPARSE *)X->spptr; 3874 if (X->ops->axpy != Y->ops->axpy) { 3875 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(Y, PETSC_FALSE)); 3876 PetscCall(MatAXPY_SeqAIJ(Y, a, X, str)); 3877 PetscFunctionReturn(PETSC_SUCCESS); 3878 } 3879 /* if we are here, it means both matrices are bound to GPU */ 3880 PetscCall(MatSeqAIJCUSPARSECopyToGPU(Y)); 3881 PetscCall(MatSeqAIJCUSPARSECopyToGPU(X)); 3882 PetscCheck(cy->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)Y), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 3883 PetscCheck(cx->format == MAT_CUSPARSE_CSR, PetscObjectComm((PetscObject)X), PETSC_ERR_GPU, "only MAT_CUSPARSE_CSR supported"); 3884 csry = (CsrMatrix *)cy->mat->mat; 3885 csrx = (CsrMatrix *)cx->mat->mat; 3886 /* see if we can turn this into a cublas axpy */ 3887 if (str != SAME_NONZERO_PATTERN && x->nz == y->nz && !x->compressedrow.use && !y->compressedrow.use) { 3888 bool eq = thrust::equal(thrust::device, csry->row_offsets->begin(), csry->row_offsets->end(), csrx->row_offsets->begin()); 3889 if (eq) eq = thrust::equal(thrust::device, csry->column_indices->begin(), csry->column_indices->end(), csrx->column_indices->begin()); 3890 if (eq) str = SAME_NONZERO_PATTERN; 3891 } 3892 /* spgeam is buggy with one column */ 3893 if (Y->cmap->n == 1 && str != SAME_NONZERO_PATTERN) str = DIFFERENT_NONZERO_PATTERN; 3894 3895 if (str == SUBSET_NONZERO_PATTERN) { 3896 PetscScalar b = 1.0; 3897 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3898 size_t bufferSize; 3899 void *buffer; 3900 #endif 3901 3902 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 3903 PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 3904 PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_HOST)); 3905 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 3906 
PetscCallCUSPARSE(cusparse_csr_spgeam_bufferSize(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3907 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), &bufferSize)); 3908 PetscCallCUDA(cudaMalloc(&buffer, bufferSize)); 3909 PetscCall(PetscLogGpuTimeBegin()); 3910 PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3911 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get(), buffer)); 3912 PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 3913 PetscCall(PetscLogGpuTimeEnd()); 3914 PetscCallCUDA(cudaFree(buffer)); 3915 #else 3916 PetscCall(PetscLogGpuTimeBegin()); 3917 PetscCallCUSPARSE(cusparse_csr_spgeam(cy->handle, Y->rmap->n, Y->cmap->n, &a, cx->mat->descr, x->nz, ax, csrx->row_offsets->data().get(), csrx->column_indices->data().get(), &b, cy->mat->descr, y->nz, ay, csry->row_offsets->data().get(), 3918 csry->column_indices->data().get(), cy->mat->descr, ay, csry->row_offsets->data().get(), csry->column_indices->data().get())); 3919 PetscCall(PetscLogGpuFlops(x->nz + y->nz)); 3920 PetscCall(PetscLogGpuTimeEnd()); 3921 #endif 3922 PetscCallCUSPARSE(cusparseSetPointerMode(cy->handle, CUSPARSE_POINTER_MODE_DEVICE)); 3923 PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(X, &ax)); 3924 PetscCall(MatSeqAIJCUSPARSERestoreArray(Y, &ay)); 3925 } else if (str == SAME_NONZERO_PATTERN) { 3926 cublasHandle_t cublasv2handle; 3927 PetscBLASInt one = 1, bnz = 1; 3928 3929 PetscCall(MatSeqAIJCUSPARSEGetArrayRead(X, &ax)); 3930 PetscCall(MatSeqAIJCUSPARSEGetArray(Y, &ay)); 3931 
/*
  MatZeroEntries_SeqAIJCUSPARSE - Sets all stored values of A to zero.

  For an unfactored matrix whose device copy is held in CSR form, the device values (and the
  values of the cached transpose, if any) are zeroed on the GPU and the offload mask is set to
  PETSC_OFFLOAD_GPU. In every other case the host value array is zeroed and the offload mask
  is set to PETSC_OFFLOAD_CPU.

  Notes:
  The device path is restricted to MAT_CUSPARSE_CSR: for the ELL/HYB formats the opaque
  `mat` pointer of the mult struct is a cusparseHybMat_t, so it must not be reinterpreted as a
  CsrMatrix. Such matrices take the host path instead.
*/
static PetscErrorCode MatZeroEntries_SeqAIJCUSPARSE(Mat A)
{
  PetscBool   gpu = PETSC_FALSE;
  Mat_SeqAIJ *a   = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_SeqAIJCUSPARSE *spptr = (Mat_SeqAIJCUSPARSE *)A->spptr;

    /* only a CSR device copy can be safely viewed through CsrMatrix */
    if (spptr && spptr->format == MAT_CUSPARSE_CSR) {
      if (spptr->mat) {
        CsrMatrix *matrix = (CsrMatrix *)spptr->mat->mat;

        if (matrix && matrix->values) {
          gpu = PETSC_TRUE;
          thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
        }
      }
      if (spptr->matTranspose) {
        CsrMatrix *matrix = (CsrMatrix *)spptr->matTranspose->mat;

        /* keep the cached transpose consistent with the zeroed matrix */
        if (matrix && matrix->values) thrust::fill(thrust::device, matrix->values->begin(), matrix->values->end(), 0.);
      }
    }
  }
  if (gpu) A->offloadmask = PETSC_OFFLOAD_GPU;
  else {
    /* no usable device values: zero the host array, whose length is the last row offset */
    PetscCall(PetscArrayzero(a->a, a->i[A->rmap->n]));
    A->offloadmask = PETSC_OFFLOAD_CPU;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", NULL)); 4031 } else { 4032 A->ops->scale = MatScale_SeqAIJCUSPARSE; 4033 A->ops->getdiagonal = MatGetDiagonal_SeqAIJCUSPARSE; 4034 A->ops->axpy = MatAXPY_SeqAIJCUSPARSE; 4035 A->ops->zeroentries = MatZeroEntries_SeqAIJCUSPARSE; 4036 A->ops->mult = MatMult_SeqAIJCUSPARSE; 4037 A->ops->multadd = MatMultAdd_SeqAIJCUSPARSE; 4038 A->ops->multtranspose = MatMultTranspose_SeqAIJCUSPARSE; 4039 A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJCUSPARSE; 4040 A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJCUSPARSE; 4041 A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJCUSPARSE; 4042 A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJCUSPARSE; 4043 A->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJCUSPARSE; 4044 a->ops->getarray = MatSeqAIJGetArray_SeqAIJCUSPARSE; 4045 a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJCUSPARSE; 4046 a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJCUSPARSE; 4047 a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJCUSPARSE; 4048 a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJCUSPARSE; 4049 a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJCUSPARSE; 4050 a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJCUSPARSE; 4051 4052 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSeqAIJCopySubArray_C", MatSeqAIJCopySubArray_SeqAIJCUSPARSE)); 4053 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdensecuda_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 4054 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqdense_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 4055 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJCUSPARSE)); 4056 PetscCall(PetscObjectComposeFunction((PetscObject)A, 
"MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJCUSPARSE)); 4057 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatProductSetFromOptions_seqaijcusparse_seqaijcusparse_C", MatProductSetFromOptions_SeqAIJCUSPARSE)); 4058 } 4059 A->boundtocpu = flg; 4060 if (flg && a->inode.size_csr) { 4061 a->inode.use = PETSC_TRUE; 4062 } else { 4063 a->inode.use = PETSC_FALSE; 4064 } 4065 PetscFunctionReturn(PETSC_SUCCESS); 4066 } 4067 4068 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJCUSPARSE(Mat A, MatType, MatReuse reuse, Mat *newmat) 4069 { 4070 Mat B; 4071 4072 PetscFunctionBegin; 4073 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); /* first use of CUSPARSE may be via MatConvert */ 4074 if (reuse == MAT_INITIAL_MATRIX) { 4075 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 4076 } else if (reuse == MAT_REUSE_MATRIX) { 4077 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); 4078 } 4079 B = *newmat; 4080 4081 PetscCall(PetscFree(B->defaultvectype)); 4082 PetscCall(PetscStrallocpy(VECCUDA, &B->defaultvectype)); 4083 4084 if (reuse != MAT_REUSE_MATRIX && !B->spptr) { 4085 if (B->factortype == MAT_FACTOR_NONE) { 4086 Mat_SeqAIJCUSPARSE *spptr; 4087 PetscCall(PetscNew(&spptr)); 4088 PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 4089 PetscCallCUSPARSE(cusparseSetStream(spptr->handle, PetscDefaultCudaStream)); 4090 spptr->format = MAT_CUSPARSE_CSR; 4091 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4092 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 4093 spptr->spmvAlg = CUSPARSE_SPMV_CSR_ALG1; /* default, since we only support csr */ 4094 #else 4095 spptr->spmvAlg = CUSPARSE_CSRMV_ALG1; /* default, since we only support csr */ 4096 #endif 4097 spptr->spmmAlg = CUSPARSE_SPMM_CSR_ALG1; /* default, only support column-major dense matrix B */ 4098 spptr->csr2cscAlg = CUSPARSE_CSR2CSC_ALG1; 4099 #endif 4100 B->spptr = spptr; 4101 } else { 4102 Mat_SeqAIJCUSPARSETriFactors *spptr; 4103 4104 PetscCall(PetscNew(&spptr)); 4105 PetscCallCUSPARSE(cusparseCreate(&spptr->handle)); 
/*
  MatCreate_SeqAIJCUSPARSE - Constructor for MATSEQAIJCUSPARSE.

  B is first set up as a SeqAIJ matrix and is then converted in place to the CUSPARSE type.
*/
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJCUSPARSE(Mat B)
{
  Mat self = B;

  PetscFunctionBegin;
  PetscCall(MatCreate_SeqAIJ(self));
  /* in-place conversion: the same matrix is passed as source and as result */
  PetscCall(MatConvert_SeqAIJ_SeqAIJCUSPARSE(self, MATSEQAIJCUSPARSE, MAT_INPLACE_MATRIX, &self));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  MatSolverTypeRegister_CUSPARSE - Registers MatGetFactor_seqaijcusparse_cusparse() as the
  MATSOLVERCUSPARSE factory for MATSEQAIJCUSPARSE matrices, once for each of the LU, Cholesky,
  ILU and ICC factor types.
*/
PETSC_INTERN PetscErrorCode MatSolverTypeRegister_CUSPARSE(void)
{
  /* registration order: LU, Cholesky, ILU, ICC */
  const MatFactorType ftypes[] = {MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_ILU, MAT_FACTOR_ICC};
  const size_t        ntypes   = sizeof(ftypes) / sizeof(ftypes[0]);

  PetscFunctionBegin;
  for (size_t k = 0; k < ntypes; k++) {
    PetscCall(MatSolverTypeRegister(MATSOLVERCUSPARSE, MATSEQAIJCUSPARSE, ftypes[k], MatGetFactor_seqaijcusparse_cusparse));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
  CsrMatrix_Destroy - Deletes a CsrMatrix together with its values, column-index and
  row-offset containers, then clears the caller's pointer.

  Input Parameter:
. mat - address of the CsrMatrix pointer; nothing is done when *mat is NULL
*/
static PetscErrorCode CsrMatrix_Destroy(CsrMatrix **mat)
{
  CsrMatrix *csr = *mat;

  PetscFunctionBegin;
  if (!csr) PetscFunctionReturn(PETSC_SUCCESS);

  /* delete the three member containers before the struct that points to them */
  delete csr->values;
  delete csr->column_indices;
  delete csr->row_offsets;
  delete csr;
  *mat = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
(cusparseHybMat_t)(*matstruct)->mat; 4232 PetscCallCUSPARSE(cusparseDestroyHybMat(hybMat)); 4233 #endif 4234 } else { 4235 mat = (CsrMatrix *)(*matstruct)->mat; 4236 PetscCall(CsrMatrix_Destroy(&mat)); 4237 } 4238 } 4239 if ((*matstruct)->descr) PetscCallCUSPARSE(cusparseDestroyMatDescr((*matstruct)->descr)); 4240 delete (*matstruct)->cprowIndices; 4241 if ((*matstruct)->alpha_one) PetscCallCUDA(cudaFree((*matstruct)->alpha_one)); 4242 if ((*matstruct)->beta_zero) PetscCallCUDA(cudaFree((*matstruct)->beta_zero)); 4243 if ((*matstruct)->beta_one) PetscCallCUDA(cudaFree((*matstruct)->beta_one)); 4244 4245 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4246 Mat_SeqAIJCUSPARSEMultStruct *mdata = *matstruct; 4247 if (mdata->matDescr) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr)); 4248 4249 for (int i = 0; i < 3; i++) { 4250 if (mdata->cuSpMV[i].initialized) { 4251 PetscCallCUDA(cudaFree(mdata->cuSpMV[i].spmvBuffer)); 4252 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecXDescr)); 4253 PetscCallCUSPARSE(cusparseDestroyDnVec(mdata->cuSpMV[i].vecYDescr)); 4254 #if PETSC_PKG_CUDA_VERSION_GE(12, 4, 0) 4255 if (mdata->matDescr_SpMV[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMV[i])); 4256 if (mdata->matDescr_SpMM[i]) PetscCallCUSPARSE(cusparseDestroySpMat(mdata->matDescr_SpMM[i])); 4257 #endif 4258 } 4259 } 4260 #endif 4261 delete *matstruct; 4262 *matstruct = NULL; 4263 } 4264 PetscFunctionReturn(PETSC_SUCCESS); 4265 } 4266 4267 PetscErrorCode MatSeqAIJCUSPARSETriFactors_Reset(Mat_SeqAIJCUSPARSETriFactors_p *trifactors) 4268 { 4269 Mat_SeqAIJCUSPARSETriFactors *fs = *trifactors; 4270 4271 PetscFunctionBegin; 4272 if (fs) { 4273 #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 4274 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtr)); 4275 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtr)); 4276 PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->loTriFactorPtrTranspose)); 4277 
PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&fs->upTriFactorPtrTranspose)); 4278 delete fs->workVector; 4279 fs->workVector = NULL; 4280 #endif 4281 delete fs->rpermIndices; 4282 delete fs->cpermIndices; 4283 fs->rpermIndices = NULL; 4284 fs->cpermIndices = NULL; 4285 fs->init_dev_prop = PETSC_FALSE; 4286 #if PETSC_PKG_CUDA_VERSION_GE(11, 4, 0) 4287 PetscCallCUDA(cudaFree(fs->csrRowPtr)); 4288 PetscCallCUDA(cudaFree(fs->csrColIdx)); 4289 PetscCallCUDA(cudaFree(fs->csrRowPtr32)); 4290 PetscCallCUDA(cudaFree(fs->csrColIdx32)); 4291 PetscCallCUDA(cudaFree(fs->csrVal)); 4292 PetscCallCUDA(cudaFree(fs->diag)); 4293 PetscCallCUDA(cudaFree(fs->X)); 4294 PetscCallCUDA(cudaFree(fs->Y)); 4295 // PetscCallCUDA(cudaFree(fs->factBuffer_M)); /* No needed since factBuffer_M shares with one of spsvBuffer_L/U */ 4296 PetscCallCUDA(cudaFree(fs->spsvBuffer_L)); 4297 PetscCallCUDA(cudaFree(fs->spsvBuffer_U)); 4298 PetscCallCUDA(cudaFree(fs->spsvBuffer_Lt)); 4299 PetscCallCUDA(cudaFree(fs->spsvBuffer_Ut)); 4300 PetscCallCUSPARSE(cusparseDestroyMatDescr(fs->matDescr_M)); 4301 PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_L)); 4302 PetscCallCUSPARSE(cusparseDestroySpMat(fs->spMatDescr_U)); 4303 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_L)); 4304 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Lt)); 4305 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_U)); 4306 PetscCallCUSPARSE(cusparseSpSV_destroyDescr(fs->spsvDescr_Ut)); 4307 PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_X)); 4308 PetscCallCUSPARSE(cusparseDestroyDnVec(fs->dnVecDescr_Y)); 4309 PetscCallCUSPARSE(cusparseDestroyCsrilu02Info(fs->ilu0Info_M)); 4310 PetscCallCUSPARSE(cusparseDestroyCsric02Info(fs->ic0Info_M)); 4311 PetscCall(PetscFree(fs->csrRowPtr_h)); 4312 PetscCall(PetscFree(fs->csrVal_h)); 4313 PetscCall(PetscFree(fs->diag_h)); 4314 fs->createdTransposeSpSVDescr = PETSC_FALSE; 4315 fs->updatedTransposeSpSVAnalysis = PETSC_FALSE; 4316 #endif 4317 } 4318 
PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSeqAIJCUSPARSETriFactors_Destroy - releases a triangular-factor context

  Input Parameter:
. trifactors - address of the context pointer; nothing happens when *trifactors is NULL

  The context is first passed to MatSeqAIJCUSPARSETriFactors_Reset(), then its cuSPARSE handle is
  destroyed and finally the structure itself is released with PetscFree().
*/
static PetscErrorCode MatSeqAIJCUSPARSETriFactors_Destroy(Mat_SeqAIJCUSPARSETriFactors **trifactors)
{
  PetscFunctionBegin;
  if (*trifactors) {
    PetscCall(MatSeqAIJCUSPARSETriFactors_Reset(trifactors));
    PetscCallCUSPARSE(cusparseDestroy((*trifactors)->handle));
    PetscCall(PetscFree(*trifactors));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Strict-weak ordering on (row, column) pairs: compares the first tuple entry, ties are broken by the second */
struct IJCompare {
  /* const-qualified so the comparator can also be invoked through a const object */
  __host__ __device__ inline bool operator()(const thrust::tuple<PetscInt, PetscInt> &t1, const thrust::tuple<PetscInt, PetscInt> &t2) const
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};

/*
  MatSeqAIJCUSPARSEInvalidateTranspose - flags the cached transpose of A as out of date

  Input Parameters:
+ A       - the matrix, must be of type MATSEQAIJCUSPARSE
- destroy - if PETSC_TRUE, additionally frees the transpose structure (cusp->matTranspose) and the csr2csc_i array

  Returns immediately when the matrix has no GPU data structure (A->spptr is NULL).
*/
static PetscErrorCode MatSeqAIJCUSPARSEInvalidateTranspose(Mat A, PetscBool destroy)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;

  PetscFunctionBegin;
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (!cusp) PetscFunctionReturn(PETSC_SUCCESS);
  if (destroy) {
    PetscCall(MatSeqAIJCUSPARSEMultStruct_Destroy(&cusp->matTranspose, cusp->format));
    delete cusp->csr2csc_i;
    cusp->csr2csc_i = NULL;
  }
  A->transupdated = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatCOOStructDestroy_SeqAIJCUSPARSE - container destructor for the device copy of the COO struct

  The perm and jmap arrays were obtained with cudaMalloc() in MatSetPreallocationCOO_SeqAIJCUSPARSE(),
  hence they are released with cudaFree(); the struct itself lives in host memory.
*/
static PetscErrorCode MatCOOStructDestroy_SeqAIJCUSPARSE(void **data)
{
  MatCOOStruct_SeqAIJ *coo = (MatCOOStruct_SeqAIJ *)*data;

  PetscFunctionBegin;
  PetscCallCUDA(cudaFree(coo->perm));
  PetscCallCUDA(cudaFree(coo->jmap));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatSetPreallocationCOO_SeqAIJCUSPARSE - COO preallocation for MATSEQAIJCUSPARSE

  Input Parameters:
+ mat   - the matrix
. coo_n - number of COO entries
. coo_i - row indices, in host or device memory (detected with PetscGetMemType())
- coo_j - column indices, same memory space as coo_i

  The host routine MatSetPreallocationCOO_SeqAIJ() does the analysis; device indices are therefore
  first copied to temporary host arrays. Afterwards the matrix is copied to the GPU and a device
  version of the COO struct (jmap and perm in device memory) is attached to the matrix under the key
  "__PETSc_MatCOOStruct_Device".
*/
static PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h = NULL;
  MatCOOStruct_SeqAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  // Fail with a clear message instead of dereferencing a missing container
  PetscCheck(container_h, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Not found MatCOOStruct on this matrix");
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend some fields that need to be different
  PetscCallCUDA(cudaMalloc((void **)&coo_d->jmap, (coo_h->nz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->jmap, coo_h->jmap, (coo_h->nz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->perm, coo_h->Atot * sizeof(PetscCount)));
  PetscCallCUDA(cudaMemcpy(coo_d->perm, coo_h->perm, coo_h->Atot * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  MatAddCOOValues - kernel that accumulates COO values into the matrix value array

  For every nonzero i in [0, nnz) the entries kv[perm[k]], k in [jmap[i], jmap[i+1]), are summed and
  either stored in a[i] (INSERT_VALUES) or added to the current a[i] (any other mode).

  Written as a grid-stride loop, so any 1D launch configuration covers all nnz entries. The index and
  the stride are formed in PetscCount arithmetic so that the products cannot wrap in 32 bits.
*/
__global__ static void MatAddCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount jmap[], const PetscCount perm[], InsertMode imode, PetscScalar a[])
{
  const PetscCount grid_size = (PetscCount)gridDim.x * blockDim.x;

  for (PetscCount i = (PetscCount)blockIdx.x * blockDim.x + threadIdx.x; i < nnz; i += grid_size) {
    PetscScalar sum = 0.0;
    for (PetscCount k = jmap[i]; k < jmap[i + 1]; k++) sum += kv[perm[k]];
    a[i] = (imode == INSERT_VALUES ? 0.0 : a[i]) + sum;
  }
}

/*
  MatSetValuesCOO_SeqAIJCUSPARSE - sets the matrix values from a COO value array

  Input Parameters:
+ A     - the matrix, previously set up with MatSetPreallocationCOO_SeqAIJCUSPARSE()
. v     - the COO values, in host or device memory (detected with PetscGetMemType())
- imode - INSERT_VALUES overwrites the current values, otherwise the values are added

  Host values are staged in a temporary device buffer of coo->n scalars that is freed before returning.
*/
static PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_SeqAIJ          *seq  = (Mat_SeqAIJ *)A->data;
  Mat_SeqAIJCUSPARSE  *dev  = (Mat_SeqAIJCUSPARSE *)A->spptr;
  PetscCount           Annz = seq->nz;
  PetscMemType         memtype;
  const PetscScalar   *v1 = v;
  PetscScalar         *Aa;
  PetscContainer       container = NULL;
  MatCOOStruct_SeqAIJ *coo;

  PetscFunctionBegin;
  if (!dev->mat) PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));

  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  // The container only exists after the COO preallocation routine has been called on A
  PetscCheck(container, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Not found MatCOOStruct on this matrix");
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa));

  PetscCall(PetscLogGpuTimeBegin());
  if (Annz) {
    // Divide in PetscCount first and only then narrow: the cast must apply to the block count, not to Annz + 255
    MatAddCOOValues<<<(int)((Annz + 255) / 256), 256>>>(v1, Annz, coo->jmap, coo->perm, imode, Aa);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
  else PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));

  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetIJ - returns
the device row storage `i` and `j` indices for `MATSEQAIJCUSPARSE` matrices.

  Not Collective

  Input Parameters:
+ A          - the matrix
- compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form

  Output Parameters:
+ i - the CSR row pointers, these are always `int` even when PETSc is configured with `--with-64-bit-indices`
- j - the CSR column indices, these are always `int` even when PETSc is configured with `--with-64-bit-indices`

  Level: developer

  Notes:
  When compressed is true, the CSR structure does not contain empty rows

  Either `i` or `j` may be `NULL` if the corresponding array is not needed; when both are `NULL` the routine returns immediately

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSERestoreIJ()`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  Mat_SeqAIJCUSPARSE *cusp = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *csr;
  Mat_SeqAIJ         *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  /* Nothing requested. With only one of the two outputs missing we must still serve the other one,
     otherwise the caller would be left with an unset pointer and a success return code */
  if (!i && !j) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  PetscCheck(cusp->format != MAT_CUSPARSE_ELL && cusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  csr = (CsrMatrix *)cusp->mat->mat;
  if (i) {
    if (!compressed && a->compressedrow.use) { /* need full row offset */
      if (!cusp->rowoffsets_gpu) {
        /* built once from the host row pointers a->i and cached on the matrix */
        cusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
        cusp->rowoffsets_gpu->assign(a->i, a->i + A->rmap->n + 1);
        PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt)));
      }
      *i = cusp->rowoffsets_gpu->data().get();
    } else *i = csr->row_offsets->data().get();
  }
  if (j) *j = csr->column_indices->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreIJ - restore the device row storage `i` and `j` indices obtained with `MatSeqAIJCUSPARSEGetIJ()`

  Not Collective

  Input Parameters:
+ A          - the matrix
. compressed - `PETSC_TRUE` or `PETSC_FALSE` indicating the matrix data structure should be always returned in compressed form
. i          - the CSR row pointers
- j          - the CSR column indices

  Level: developer

  Note:
  The non-`NULL` pointers among `i` and `j` are set to `NULL` on return; `compressed` is not used

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetIJ()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreIJ(Mat A, PetscBool compressed, const int **i, const int **j)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  if (i) *i = NULL;
  if (j) *j = NULL;
  (void)compressed;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayRead - gives read-only access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix nonzero entries are stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
.
a - pointer to the device data

  Level: developer

  Note:
  Will trigger host-to-device copies if the most up-to-date matrix data is on the host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayRead(Mat A, const PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the ELL and HYB storage formats are rejected as not implemented */
  PetscCheck(!(cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* make sure the device copy exists and is current before handing out its values */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get();
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayRead - restore the read-only access array obtained from `MatSeqAIJCUSPARSEGetArrayRead()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

  Note:
  The pointer is set to `NULL` on return; the matrix itself is not modified

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayRead(Mat A, const PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* read-only access: nothing to flag, just take the pointer away from the caller */
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArray - gives read-write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Will trigger host-to-device copies if the most up-to-date matrix data is on the host

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSEGetArrayWrite()`, `MatSeqAIJCUSPARSERestoreArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArray(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the ELL and HYB storage formats are rejected as not implemented */
  PetscCheck(!(cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* bring the device copy up to date first: the caller may read the values as well as write them */
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get();
  /* from now on the GPU holds the valid values and a cached transpose can no longer be trusted */
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArray - restore the read-write access array obtained from `MatSeqAIJCUSPARSEGetArray()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

  Note:
  Increases the object state of `A` and sets the pointer to `NULL`

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArray(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the values may have been modified through the pointer */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSEGetArrayWrite - gives write access to the array where the device data for a `MATSEQAIJCUSPARSE` matrix is stored

  Not Collective

  Input Parameter:
. A - a `MATSEQAIJCUSPARSE` matrix

  Output Parameter:
. a - pointer to the device data

  Level: developer

  Note:
  Does not trigger any host to device copies.

  It marks the data GPU valid so users must set all the values in `a` to ensure out-of-date data is not considered current

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArray()`, `MatSeqAIJCUSPARSEGetArrayRead()`, `MatSeqAIJCUSPARSERestoreArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSEGetArrayWrite(Mat A, PetscScalar **a)
{
  Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE *)A->spptr;
  CsrMatrix          *matrix;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the ELL and HYB storage formats are rejected as not implemented */
  PetscCheck(!(cusparsestruct->format == MAT_CUSPARSE_ELL || cusparsestruct->format == MAT_CUSPARSE_HYB), PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented");
  /* no copy to the GPU here: the device structure has to exist already */
  PetscCheck(cusparsestruct->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct");
  matrix = (CsrMatrix *)cusparsestruct->mat->mat;
  PetscCheck(matrix->values, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing CUDA memory");
  *a = matrix->values->data().get();
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(A, PETSC_FALSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@C
  MatSeqAIJCUSPARSERestoreArrayWrite - restore the write-only access array obtained from `MatSeqAIJCUSPARSEGetArrayWrite()`

  Not Collective

  Input Parameters:
+ A - a `MATSEQAIJCUSPARSE` matrix
- a - pointer to the device data

  Level: developer

.seealso: [](ch_matrices), `Mat`, `MatSeqAIJCUSPARSEGetArrayWrite()`
@*/
PetscErrorCode MatSeqAIJCUSPARSERestoreArrayWrite(Mat A, PetscScalar **a)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(a, 2);
  PetscCheckTypeName(A, MATSEQAIJCUSPARSE);
  /* the values have been (re)written through the pointer: bump the object state, then take the pointer away */
  PetscCall(PetscObjectStateIncrease((PetscObject)A));
  *a = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Orders (row, column, value, tag) tuples by row first and by column second; value and tag do not take part in the comparison */
struct IJCompare4 {
  /* const-qualified so the comparator can also be invoked through a const object */
  __host__ __device__ inline bool operator()(const thrust::tuple<int, int, PetscScalar, int> &t1, const thrust::tuple<int, int, PetscScalar, int> &t2) const
  {
    if (thrust::get<0>(t1) < thrust::get<0>(t2)) return true;
    if (thrust::get<0>(t1) == thrust::get<0>(t2)) return thrust::get<1>(t1) < thrust::get<1>(t2);
    return false;
  }
};

/* Unary functor adding a fixed offset to an integer */
struct Shift {
  int _shift; /* the offset added by operator() */

  Shift(int shift) : _shift(shift) { }
  /* does not modify the functor, hence const */
  __host__ __device__ inline int operator()(const int &c) const { return c + _shift; }
};

/* merges two SeqAIJCUSPARSE matrices A, B by concatenating their rows.
[A';B']' operation in MATLAB notation */ 4734 PetscErrorCode MatSeqAIJCUSPARSEMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 4735 { 4736 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)B->data, *c; 4737 Mat_SeqAIJCUSPARSE *Acusp = (Mat_SeqAIJCUSPARSE *)A->spptr, *Bcusp = (Mat_SeqAIJCUSPARSE *)B->spptr, *Ccusp; 4738 Mat_SeqAIJCUSPARSEMultStruct *Cmat; 4739 CsrMatrix *Acsr, *Bcsr, *Ccsr; 4740 PetscInt Annz, Bnnz; 4741 cusparseStatus_t stat; 4742 PetscInt i, m, n, zero = 0; 4743 4744 PetscFunctionBegin; 4745 PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 4746 PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 4747 PetscAssertPointer(C, 4); 4748 PetscCheckTypeName(A, MATSEQAIJCUSPARSE); 4749 PetscCheckTypeName(B, MATSEQAIJCUSPARSE); 4750 PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 4751 PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 4752 PetscCheck(Acusp->format != MAT_CUSPARSE_ELL && Acusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4753 PetscCheck(Bcusp->format != MAT_CUSPARSE_ELL && Bcusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4754 if (reuse == MAT_INITIAL_MATRIX) { 4755 m = A->rmap->n; 4756 n = A->cmap->n + B->cmap->n; 4757 PetscCall(MatCreate(PETSC_COMM_SELF, C)); 4758 PetscCall(MatSetSizes(*C, m, n, m, n)); 4759 PetscCall(MatSetType(*C, MATSEQAIJCUSPARSE)); 4760 c = (Mat_SeqAIJ *)(*C)->data; 4761 Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 4762 Cmat = new Mat_SeqAIJCUSPARSEMultStruct; 4763 Ccsr = new CsrMatrix; 4764 Cmat->cprowIndices = NULL; 4765 c->compressedrow.use = PETSC_FALSE; 4766 c->compressedrow.nrows = 0; 4767 c->compressedrow.i = NULL; 4768 c->compressedrow.rindex = NULL; 4769 Ccusp->workVector = NULL; 4770 Ccusp->nrows = m; 4771 Ccusp->mat = Cmat; 4772 Ccusp->mat->mat = Ccsr; 4773 Ccsr->num_rows = 
m; 4774 Ccsr->num_cols = n; 4775 PetscCallCUSPARSE(cusparseCreateMatDescr(&Cmat->descr)); 4776 PetscCallCUSPARSE(cusparseSetMatIndexBase(Cmat->descr, CUSPARSE_INDEX_BASE_ZERO)); 4777 PetscCallCUSPARSE(cusparseSetMatType(Cmat->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4778 PetscCallCUDA(cudaMalloc((void **)&Cmat->alpha_one, sizeof(PetscScalar))); 4779 PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_zero, sizeof(PetscScalar))); 4780 PetscCallCUDA(cudaMalloc((void **)&Cmat->beta_one, sizeof(PetscScalar))); 4781 PetscCallCUDA(cudaMemcpy(Cmat->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4782 PetscCallCUDA(cudaMemcpy(Cmat->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4783 PetscCallCUDA(cudaMemcpy(Cmat->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4784 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4785 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 4786 PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4787 PetscCheck(Bcusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4788 4789 Acsr = (CsrMatrix *)Acusp->mat->mat; 4790 Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4791 Annz = (PetscInt)Acsr->column_indices->size(); 4792 Bnnz = (PetscInt)Bcsr->column_indices->size(); 4793 c->nz = Annz + Bnnz; 4794 Ccsr->row_offsets = new THRUSTINTARRAY32(m + 1); 4795 Ccsr->column_indices = new THRUSTINTARRAY32(c->nz); 4796 Ccsr->values = new THRUSTARRAY(c->nz); 4797 Ccsr->num_entries = c->nz; 4798 Ccusp->coords = new THRUSTINTARRAY(c->nz); 4799 if (c->nz) { 4800 auto Acoo = new THRUSTINTARRAY32(Annz); 4801 auto Bcoo = new THRUSTINTARRAY32(Bnnz); 4802 auto Ccoo = new THRUSTINTARRAY32(c->nz); 4803 THRUSTINTARRAY32 *Aroff, *Broff; 4804 4805 if (a->compressedrow.use) { /* need full row offset */ 4806 if (!Acusp->rowoffsets_gpu) { 4807 Acusp->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 4808 Acusp->rowoffsets_gpu->assign(a->i, 
a->i + A->rmap->n + 1); 4809 PetscCall(PetscLogCpuToGpu((A->rmap->n + 1) * sizeof(PetscInt))); 4810 } 4811 Aroff = Acusp->rowoffsets_gpu; 4812 } else Aroff = Acsr->row_offsets; 4813 if (b->compressedrow.use) { /* need full row offset */ 4814 if (!Bcusp->rowoffsets_gpu) { 4815 Bcusp->rowoffsets_gpu = new THRUSTINTARRAY32(B->rmap->n + 1); 4816 Bcusp->rowoffsets_gpu->assign(b->i, b->i + B->rmap->n + 1); 4817 PetscCall(PetscLogCpuToGpu((B->rmap->n + 1) * sizeof(PetscInt))); 4818 } 4819 Broff = Bcusp->rowoffsets_gpu; 4820 } else Broff = Bcsr->row_offsets; 4821 PetscCall(PetscLogGpuTimeBegin()); 4822 stat = cusparseXcsr2coo(Acusp->handle, Aroff->data().get(), Annz, m, Acoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 4823 PetscCallCUSPARSE(stat); 4824 stat = cusparseXcsr2coo(Bcusp->handle, Broff->data().get(), Bnnz, m, Bcoo->data().get(), CUSPARSE_INDEX_BASE_ZERO); 4825 PetscCallCUSPARSE(stat); 4826 /* Issues when using bool with large matrices on SUMMIT 10.2.89 */ 4827 auto Aperm = thrust::make_constant_iterator(1); 4828 auto Bperm = thrust::make_constant_iterator(0); 4829 #if PETSC_PKG_CUDA_VERSION_GE(10, 0, 0) 4830 auto Bcib = thrust::make_transform_iterator(Bcsr->column_indices->begin(), Shift(A->cmap->n)); 4831 auto Bcie = thrust::make_transform_iterator(Bcsr->column_indices->end(), Shift(A->cmap->n)); 4832 #else 4833 /* there are issues instantiating the merge operation using a transform iterator for the columns of B */ 4834 auto Bcib = Bcsr->column_indices->begin(); 4835 auto Bcie = Bcsr->column_indices->end(); 4836 thrust::transform(Bcib, Bcie, Bcib, Shift(A->cmap->n)); 4837 #endif 4838 auto wPerm = new THRUSTINTARRAY32(Annz + Bnnz); 4839 auto Azb = thrust::make_zip_iterator(thrust::make_tuple(Acoo->begin(), Acsr->column_indices->begin(), Acsr->values->begin(), Aperm)); 4840 auto Aze = thrust::make_zip_iterator(thrust::make_tuple(Acoo->end(), Acsr->column_indices->end(), Acsr->values->end(), Aperm)); 4841 auto Bzb = 
thrust::make_zip_iterator(thrust::make_tuple(Bcoo->begin(), Bcib, Bcsr->values->begin(), Bperm)); 4842 auto Bze = thrust::make_zip_iterator(thrust::make_tuple(Bcoo->end(), Bcie, Bcsr->values->end(), Bperm)); 4843 auto Czb = thrust::make_zip_iterator(thrust::make_tuple(Ccoo->begin(), Ccsr->column_indices->begin(), Ccsr->values->begin(), wPerm->begin())); 4844 auto p1 = Ccusp->coords->begin(); 4845 auto p2 = Ccusp->coords->begin(); 4846 thrust::advance(p2, Annz); 4847 PetscCallThrust(thrust::merge(thrust::device, Azb, Aze, Bzb, Bze, Czb, IJCompare4())); 4848 #if PETSC_PKG_CUDA_VERSION_LT(10, 0, 0) 4849 thrust::transform(Bcib, Bcie, Bcib, Shift(-A->cmap->n)); 4850 #endif 4851 auto cci = thrust::make_counting_iterator(zero); 4852 auto cce = thrust::make_counting_iterator(c->nz); 4853 #if 0 //Errors on SUMMIT cuda 11.1.0 4854 PetscCallThrust(thrust::partition_copy(thrust::device,cci,cce,wPerm->begin(),p1,p2,thrust::identity<int>())); 4855 #else 4856 #if PETSC_PKG_CUDA_VERSION_LT(12, 9, 0) || PetscDefined(HAVE_THRUST) 4857 auto pred = thrust::identity<int>(); 4858 #else 4859 auto pred = cuda::std::identity(); 4860 #endif 4861 PetscCallThrust(thrust::copy_if(thrust::device, cci, cce, wPerm->begin(), p1, pred)); 4862 PetscCallThrust(thrust::remove_copy_if(thrust::device, cci, cce, wPerm->begin(), p2, pred)); 4863 #endif 4864 stat = cusparseXcoo2csr(Ccusp->handle, Ccoo->data().get(), c->nz, m, Ccsr->row_offsets->data().get(), CUSPARSE_INDEX_BASE_ZERO); 4865 PetscCallCUSPARSE(stat); 4866 PetscCall(PetscLogGpuTimeEnd()); 4867 delete wPerm; 4868 delete Acoo; 4869 delete Bcoo; 4870 delete Ccoo; 4871 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4872 stat = cusparseCreateCsr(&Cmat->matDescr, Ccsr->num_rows, Ccsr->num_cols, Ccsr->num_entries, Ccsr->row_offsets->data().get(), Ccsr->column_indices->data().get(), Ccsr->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 4873 PetscCallCUSPARSE(stat); 4874 #endif 4875 if 
(A->form_explicit_transpose && B->form_explicit_transpose) { /* if A and B have the transpose, generate C transpose too */ 4876 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(A)); 4877 PetscCall(MatSeqAIJCUSPARSEFormExplicitTranspose(B)); 4878 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 4879 Mat_SeqAIJCUSPARSEMultStruct *CmatT = new Mat_SeqAIJCUSPARSEMultStruct; 4880 CsrMatrix *CcsrT = new CsrMatrix; 4881 CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 4882 CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL; 4883 4884 (*C)->form_explicit_transpose = PETSC_TRUE; 4885 (*C)->transupdated = PETSC_TRUE; 4886 Ccusp->rowoffsets_gpu = NULL; 4887 CmatT->cprowIndices = NULL; 4888 CmatT->mat = CcsrT; 4889 CcsrT->num_rows = n; 4890 CcsrT->num_cols = m; 4891 CcsrT->num_entries = c->nz; 4892 4893 CcsrT->row_offsets = new THRUSTINTARRAY32(n + 1); 4894 CcsrT->column_indices = new THRUSTINTARRAY32(c->nz); 4895 CcsrT->values = new THRUSTARRAY(c->nz); 4896 4897 PetscCall(PetscLogGpuTimeBegin()); 4898 auto rT = CcsrT->row_offsets->begin(); 4899 if (AT) { 4900 rT = thrust::copy(AcsrT->row_offsets->begin(), AcsrT->row_offsets->end(), rT); 4901 thrust::advance(rT, -1); 4902 } 4903 if (BT) { 4904 auto titb = thrust::make_transform_iterator(BcsrT->row_offsets->begin(), Shift(a->nz)); 4905 auto tite = thrust::make_transform_iterator(BcsrT->row_offsets->end(), Shift(a->nz)); 4906 thrust::copy(titb, tite, rT); 4907 } 4908 auto cT = CcsrT->column_indices->begin(); 4909 if (AT) cT = thrust::copy(AcsrT->column_indices->begin(), AcsrT->column_indices->end(), cT); 4910 if (BT) thrust::copy(BcsrT->column_indices->begin(), BcsrT->column_indices->end(), cT); 4911 auto vT = CcsrT->values->begin(); 4912 if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 4913 if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 4914 
PetscCall(PetscLogGpuTimeEnd()); 4915 4916 PetscCallCUSPARSE(cusparseCreateMatDescr(&CmatT->descr)); 4917 PetscCallCUSPARSE(cusparseSetMatIndexBase(CmatT->descr, CUSPARSE_INDEX_BASE_ZERO)); 4918 PetscCallCUSPARSE(cusparseSetMatType(CmatT->descr, CUSPARSE_MATRIX_TYPE_GENERAL)); 4919 PetscCallCUDA(cudaMalloc((void **)&CmatT->alpha_one, sizeof(PetscScalar))); 4920 PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_zero, sizeof(PetscScalar))); 4921 PetscCallCUDA(cudaMalloc((void **)&CmatT->beta_one, sizeof(PetscScalar))); 4922 PetscCallCUDA(cudaMemcpy(CmatT->alpha_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4923 PetscCallCUDA(cudaMemcpy(CmatT->beta_zero, &PETSC_CUSPARSE_ZERO, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4924 PetscCallCUDA(cudaMemcpy(CmatT->beta_one, &PETSC_CUSPARSE_ONE, sizeof(PetscScalar), cudaMemcpyHostToDevice)); 4925 #if PETSC_PKG_CUDA_VERSION_GE(11, 0, 0) 4926 stat = cusparseCreateCsr(&CmatT->matDescr, CcsrT->num_rows, CcsrT->num_cols, CcsrT->num_entries, CcsrT->row_offsets->data().get(), CcsrT->column_indices->data().get(), CcsrT->values->data().get(), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, cusparse_scalartype); 4927 PetscCallCUSPARSE(stat); 4928 #endif 4929 Ccusp->matTranspose = CmatT; 4930 } 4931 } 4932 4933 c->free_a = PETSC_TRUE; 4934 PetscCall(PetscShmgetAllocateArray(c->nz, sizeof(PetscInt), (void **)&c->j)); 4935 PetscCall(PetscShmgetAllocateArray(m + 1, sizeof(PetscInt), (void **)&c->i)); 4936 c->free_ij = PETSC_TRUE; 4937 if (PetscDefined(USE_64BIT_INDICES)) { /* 32 to 64-bit conversion on the GPU and then copy to host (lazy) */ 4938 THRUSTINTARRAY ii(Ccsr->row_offsets->size()); 4939 THRUSTINTARRAY jj(Ccsr->column_indices->size()); 4940 ii = *Ccsr->row_offsets; 4941 jj = *Ccsr->column_indices; 4942 PetscCallCUDA(cudaMemcpy(c->i, ii.data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4943 PetscCallCUDA(cudaMemcpy(c->j, jj.data().get(), 
Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4944 } else { 4945 PetscCallCUDA(cudaMemcpy(c->i, Ccsr->row_offsets->data().get(), Ccsr->row_offsets->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4946 PetscCallCUDA(cudaMemcpy(c->j, Ccsr->column_indices->data().get(), Ccsr->column_indices->size() * sizeof(PetscInt), cudaMemcpyDeviceToHost)); 4947 } 4948 PetscCall(PetscLogGpuToCpu((Ccsr->column_indices->size() + Ccsr->row_offsets->size()) * sizeof(PetscInt))); 4949 PetscCall(PetscMalloc1(m, &c->ilen)); 4950 PetscCall(PetscMalloc1(m, &c->imax)); 4951 c->maxnz = c->nz; 4952 c->nonzerorowcnt = 0; 4953 c->rmax = 0; 4954 for (i = 0; i < m; i++) { 4955 const PetscInt nn = c->i[i + 1] - c->i[i]; 4956 c->ilen[i] = c->imax[i] = nn; 4957 c->nonzerorowcnt += (PetscInt)!!nn; 4958 c->rmax = PetscMax(c->rmax, nn); 4959 } 4960 PetscCall(MatMarkDiagonal_SeqAIJ(*C)); 4961 PetscCall(PetscMalloc1(c->nz, &c->a)); 4962 (*C)->nonzerostate++; 4963 PetscCall(PetscLayoutSetUp((*C)->rmap)); 4964 PetscCall(PetscLayoutSetUp((*C)->cmap)); 4965 Ccusp->nonzerostate = (*C)->nonzerostate; 4966 (*C)->preallocated = PETSC_TRUE; 4967 } else { 4968 PetscCheck((*C)->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, (*C)->rmap->n, B->rmap->n); 4969 c = (Mat_SeqAIJ *)(*C)->data; 4970 if (c->nz) { 4971 Ccusp = (Mat_SeqAIJCUSPARSE *)(*C)->spptr; 4972 PetscCheck(Ccusp->coords, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing coords"); 4973 PetscCheck(Ccusp->format != MAT_CUSPARSE_ELL && Ccusp->format != MAT_CUSPARSE_HYB, PETSC_COMM_SELF, PETSC_ERR_SUP, "Not implemented"); 4974 PetscCheck(Ccusp->nonzerostate == (*C)->nonzerostate, PETSC_COMM_SELF, PETSC_ERR_COR, "Wrong nonzerostate"); 4975 PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 4976 PetscCall(MatSeqAIJCUSPARSECopyToGPU(B)); 4977 PetscCheck(Acusp->mat, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4978 PetscCheck(Bcusp->mat, 
PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Mat_SeqAIJCUSPARSEMultStruct"); 4979 Acsr = (CsrMatrix *)Acusp->mat->mat; 4980 Bcsr = (CsrMatrix *)Bcusp->mat->mat; 4981 Ccsr = (CsrMatrix *)Ccusp->mat->mat; 4982 PetscCheck(Acsr->num_entries == (PetscInt)Acsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "A nnz %" PetscInt_FMT " != %" PetscInt_FMT, Acsr->num_entries, (PetscInt)Acsr->values->size()); 4983 PetscCheck(Bcsr->num_entries == (PetscInt)Bcsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "B nnz %" PetscInt_FMT " != %" PetscInt_FMT, Bcsr->num_entries, (PetscInt)Bcsr->values->size()); 4984 PetscCheck(Ccsr->num_entries == (PetscInt)Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT, Ccsr->num_entries, (PetscInt)Ccsr->values->size()); 4985 PetscCheck(Ccsr->num_entries == Acsr->num_entries + Bcsr->num_entries, PETSC_COMM_SELF, PETSC_ERR_COR, "C nnz %" PetscInt_FMT " != %" PetscInt_FMT " + %" PetscInt_FMT, Ccsr->num_entries, Acsr->num_entries, Bcsr->num_entries); 4986 PetscCheck(Ccusp->coords->size() == Ccsr->values->size(), PETSC_COMM_SELF, PETSC_ERR_COR, "permSize %" PetscInt_FMT " != %" PetscInt_FMT, (PetscInt)Ccusp->coords->size(), (PetscInt)Ccsr->values->size()); 4987 auto pmid = Ccusp->coords->begin(); 4988 thrust::advance(pmid, Acsr->num_entries); 4989 PetscCall(PetscLogGpuTimeBegin()); 4990 auto zibait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->begin()))); 4991 auto zieait = thrust::make_zip_iterator(thrust::make_tuple(Acsr->values->end(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 4992 thrust::for_each(zibait, zieait, VecCUDAEquals()); 4993 auto zibbit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->begin(), thrust::make_permutation_iterator(Ccsr->values->begin(), pmid))); 4994 auto ziebit = thrust::make_zip_iterator(thrust::make_tuple(Bcsr->values->end(), 
thrust::make_permutation_iterator(Ccsr->values->begin(), Ccusp->coords->end()))); 4995 thrust::for_each(zibbit, ziebit, VecCUDAEquals()); 4996 PetscCall(MatSeqAIJCUSPARSEInvalidateTranspose(*C, PETSC_FALSE)); 4997 if (A->form_explicit_transpose && B->form_explicit_transpose && (*C)->form_explicit_transpose) { 4998 PetscCheck(Ccusp->matTranspose, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing transpose Mat_SeqAIJCUSPARSEMultStruct"); 4999 PetscBool AT = Acusp->matTranspose ? PETSC_TRUE : PETSC_FALSE, BT = Bcusp->matTranspose ? PETSC_TRUE : PETSC_FALSE; 5000 CsrMatrix *AcsrT = AT ? (CsrMatrix *)Acusp->matTranspose->mat : NULL; 5001 CsrMatrix *BcsrT = BT ? (CsrMatrix *)Bcusp->matTranspose->mat : NULL; 5002 CsrMatrix *CcsrT = (CsrMatrix *)Ccusp->matTranspose->mat; 5003 auto vT = CcsrT->values->begin(); 5004 if (AT) vT = thrust::copy(AcsrT->values->begin(), AcsrT->values->end(), vT); 5005 if (BT) thrust::copy(BcsrT->values->begin(), BcsrT->values->end(), vT); 5006 (*C)->transupdated = PETSC_TRUE; 5007 } 5008 PetscCall(PetscLogGpuTimeEnd()); 5009 } 5010 } 5011 PetscCall(PetscObjectStateIncrease((PetscObject)*C)); 5012 (*C)->assembled = PETSC_TRUE; 5013 (*C)->was_assembled = PETSC_FALSE; 5014 (*C)->offloadmask = PETSC_OFFLOAD_GPU; 5015 PetscFunctionReturn(PETSC_SUCCESS); 5016 }

/*
  MatSeqAIJCopySubArray_SeqAIJCUSPARSE - Copies n entries of the value array of A, as returned by
  MatSeqAIJCUSPARSEGetArrayRead(), into v.

  Input Parameters:
+ A   - the matrix whose value array is read
. n   - number of entries to copy
- idx - positions in the value array to gather from (read on the host); when idx is NULL, or n is 0,
        the first n entries are copied contiguously instead

  Output Parameter:
. v - destination for the n values; isCudaMem() decides whether it is treated as device or host memory

  Notes:
  When gathering into a host destination the values are first collected in a temporary device buffer
  and then copied back with a single cudaMemcpy().
*/
static PetscErrorCode MatSeqAIJCopySubArray_SeqAIJCUSPARSE(Mat A, PetscInt n, const PetscInt idx[], PetscScalar v[])
{
  bool               dmem;
  const PetscScalar *av;

  PetscFunctionBegin;
  dmem = isCudaMem(v);
  PetscCall(MatSeqAIJCUSPARSEGetArrayRead(A, &av));
  if (n && idx) {
    /* upload the gather indices */
    THRUSTINTARRAY widx(n);
    widx.assign(idx, idx + n);
    PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));

    /* Staging buffer used only for a host destination. It has automatic storage so that it is
       released even if one of the checked calls below returns early with an error. */
    THRUSTARRAY                     w;
    thrust::device_ptr<PetscScalar> dv;
    if (dmem) {
      dv = thrust::device_pointer_cast(v);
    } else {
      w.resize(n);
      dv = w.data();
    }
    thrust::device_ptr<const PetscScalar> dav = thrust::device_pointer_cast(av);

    /* dv[k] = av[widx[k]] for k = 0, ..., n-1 */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.begin()), dv));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(dav, widx.end()), dv + n));
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    if (!dmem) PetscCallCUDA(cudaMemcpy(v, w.data().get(), n * sizeof(PetscScalar), cudaMemcpyDeviceToHost));
  } else {
    PetscCallCUDA(cudaMemcpy(v, av, n * sizeof(PetscScalar), dmem ? cudaMemcpyDeviceToDevice : cudaMemcpyDeviceToHost));
  }
  /* a host destination means the n values travelled from the device to the host */
  if (!dmem) PetscCall(PetscLogGpuToCpu(n * sizeof(PetscScalar)));
  PetscCall(MatSeqAIJCUSPARSERestoreArrayRead(A, &av));
  PetscFunctionReturn(PETSC_SUCCESS);
}