1*2d1451d4SHong Zhang #include <petscconf.h> 2*2d1451d4SHong Zhang #include <../src/mat/impls/sell/mpi/mpisell.h> /*I "petscmat.h" I*/ 3*2d1451d4SHong Zhang 4*2d1451d4SHong Zhang PetscErrorCode MatMPISELLSetPreallocation_MPISELLCUDA(Mat B, PetscInt d_rlenmax, const PetscInt d_rlen[], PetscInt o_rlenmax, const PetscInt o_rlen[]) 5*2d1451d4SHong Zhang { 6*2d1451d4SHong Zhang Mat_MPISELL *b = (Mat_MPISELL *)B->data; 7*2d1451d4SHong Zhang 8*2d1451d4SHong Zhang PetscFunctionBegin; 9*2d1451d4SHong Zhang PetscCall(PetscLayoutSetUp(B->rmap)); 10*2d1451d4SHong Zhang PetscCall(PetscLayoutSetUp(B->cmap)); 11*2d1451d4SHong Zhang 12*2d1451d4SHong Zhang if (!B->preallocated) { 13*2d1451d4SHong Zhang /* Explicitly create 2 MATSEQSELLCUDA matrices. */ 14*2d1451d4SHong Zhang PetscCall(MatCreate(PETSC_COMM_SELF, &b->A)); 15*2d1451d4SHong Zhang PetscCall(MatBindToCPU(b->A, B->boundtocpu)); 16*2d1451d4SHong Zhang PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n)); 17*2d1451d4SHong Zhang PetscCall(MatSetType(b->A, MATSEQSELLCUDA)); 18*2d1451d4SHong Zhang PetscCall(MatCreate(PETSC_COMM_SELF, &b->B)); 19*2d1451d4SHong Zhang PetscCall(MatBindToCPU(b->B, B->boundtocpu)); 20*2d1451d4SHong Zhang PetscCall(MatSetSizes(b->B, B->rmap->n, B->cmap->N, B->rmap->n, B->cmap->N)); 21*2d1451d4SHong Zhang PetscCall(MatSetType(b->B, MATSEQSELLCUDA)); 22*2d1451d4SHong Zhang } 23*2d1451d4SHong Zhang PetscCall(MatSeqSELLSetPreallocation(b->A, d_rlenmax, d_rlen)); 24*2d1451d4SHong Zhang PetscCall(MatSeqSELLSetPreallocation(b->B, o_rlenmax, o_rlen)); 25*2d1451d4SHong Zhang B->preallocated = PETSC_TRUE; 26*2d1451d4SHong Zhang B->was_assembled = PETSC_FALSE; 27*2d1451d4SHong Zhang B->assembled = PETSC_FALSE; 28*2d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 29*2d1451d4SHong Zhang } 30*2d1451d4SHong Zhang 31*2d1451d4SHong Zhang PetscErrorCode MatMult_MPISELLCUDA(Mat A, Vec xx, Vec yy) 32*2d1451d4SHong Zhang { 33*2d1451d4SHong Zhang Mat_MPISELL *a = (Mat_MPISELL *)A->data; 34*2d1451d4SHong Zhang PetscInt nt; 35*2d1451d4SHong Zhang 36*2d1451d4SHong Zhang PetscFunctionBegin; 37*2d1451d4SHong Zhang PetscCall(VecGetLocalSize(xx, &nt)); 38*2d1451d4SHong Zhang PetscCheck(nt == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->cmap->n, nt); 39*2d1451d4SHong Zhang PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 40*2d1451d4SHong Zhang PetscCall((*a->A->ops->mult)(a->A, xx, yy)); 41*2d1451d4SHong Zhang PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 42*2d1451d4SHong Zhang PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy)); 43*2d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 44*2d1451d4SHong Zhang } 45*2d1451d4SHong Zhang 46*2d1451d4SHong Zhang PetscErrorCode MatMultAdd_MPISELLCUDA(Mat A, Vec xx, Vec yy, Vec zz) 47*2d1451d4SHong Zhang { 48*2d1451d4SHong Zhang Mat_MPISELL *a = (Mat_MPISELL *)A->data; 49*2d1451d4SHong Zhang PetscInt nt; 50*2d1451d4SHong Zhang 51*2d1451d4SHong Zhang PetscFunctionBegin; 52*2d1451d4SHong Zhang PetscCall(VecGetLocalSize(xx, &nt)); 53*2d1451d4SHong Zhang PetscCheck(nt == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->cmap->n, nt); 54*2d1451d4SHong Zhang PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 55*2d1451d4SHong Zhang PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz)); 56*2d1451d4SHong Zhang PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 57*2d1451d4SHong Zhang PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz)); 58*2d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 59*2d1451d4SHong Zhang } 60*2d1451d4SHong Zhang 61*2d1451d4SHong Zhang PetscErrorCode MatMultTranspose_MPISELLCUDA(Mat A, Vec xx, Vec yy) 62*2d1451d4SHong Zhang { 63*2d1451d4SHong Zhang Mat_MPISELL *a = (Mat_MPISELL *)A->data; 64*2d1451d4SHong Zhang PetscInt nt; 65*2d1451d4SHong Zhang 66*2d1451d4SHong Zhang PetscFunctionBegin; 67*2d1451d4SHong Zhang PetscCall(VecGetLocalSize(xx, &nt)); 68*2d1451d4SHong Zhang PetscCheck(nt == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->rmap->n, nt); 69*2d1451d4SHong Zhang PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec)); 70*2d1451d4SHong Zhang PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy)); 71*2d1451d4SHong Zhang PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE)); 72*2d1451d4SHong Zhang PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE)); 73*2d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 74*2d1451d4SHong Zhang } 75*2d1451d4SHong Zhang 76*2d1451d4SHong Zhang PetscErrorCode MatSetFromOptions_MPISELLCUDA(PetscOptionItems *PetscOptionsObject, Mat A) 77*2d1451d4SHong Zhang { 78*2d1451d4SHong Zhang PetscFunctionBegin; 79*2d1451d4SHong Zhang PetscOptionsHeadBegin(PetscOptionsObject, "MPISELLCUDA options"); 80*2d1451d4SHong Zhang if (A->factortype == MAT_FACTOR_NONE) { } 81*2d1451d4SHong Zhang PetscOptionsHeadEnd(); 82*2d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 83*2d1451d4SHong Zhang } 84*2d1451d4SHong Zhang 85*2d1451d4SHong Zhang PetscErrorCode MatAssemblyEnd_MPISELLCUDA(Mat A, MatAssemblyType mode) 86*2d1451d4SHong Zhang { 87*2d1451d4SHong Zhang Mat_MPISELL *mpisell; 88*2d1451d4SHong Zhang 89*2d1451d4SHong Zhang PetscFunctionBegin; 90*2d1451d4SHong Zhang mpisell = (Mat_MPISELL *)A->data; 91*2d1451d4SHong Zhang PetscCall(MatAssemblyEnd_MPISELL(A, mode)); 92*2d1451d4SHong Zhang if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) { PetscCall(VecSetType(mpisell->lvec, VECSEQCUDA)); } 93*2d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 94*2d1451d4SHong Zhang } 95*2d1451d4SHong Zhang 96*2d1451d4SHong Zhang PetscErrorCode MatDestroy_MPISELLCUDA(Mat A) 97*2d1451d4SHong Zhang { 98*2d1451d4SHong Zhang PetscFunctionBegin; 99*2d1451d4SHong Zhang PetscCall(MatDestroy_MPISELL(A)); 100*2d1451d4SHong Zhang PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpisellcuda_mpiaij_C", NULL)); 101*2d1451d4SHong Zhang PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPISELLSetPreallocation_C", NULL)); 102*2d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 103*2d1451d4SHong Zhang } 104*2d1451d4SHong Zhang 105*2d1451d4SHong Zhang PETSC_EXTERN PetscErrorCode MatCreate_MPISELLCUDA(Mat A) 106*2d1451d4SHong Zhang { 107*2d1451d4SHong Zhang PetscFunctionBegin; 108*2d1451d4SHong Zhang PetscCall(MatCreate_MPISELL(A)); 109*2d1451d4SHong Zhang PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPISELLSetPreallocation_C", MatMPISELLSetPreallocation_MPISELLCUDA)); 110*2d1451d4SHong Zhang PetscCall(PetscFree(A->defaultvectype)); 111*2d1451d4SHong Zhang PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype)); 112*2d1451d4SHong Zhang 113*2d1451d4SHong Zhang A->ops->assemblyend = MatAssemblyEnd_MPISELLCUDA; 114*2d1451d4SHong Zhang A->ops->mult = MatMult_MPISELLCUDA; 115*2d1451d4SHong Zhang A->ops->multadd = MatMultAdd_MPISELLCUDA; 116*2d1451d4SHong Zhang A->ops->multtranspose = MatMultTranspose_MPISELLCUDA; 117*2d1451d4SHong Zhang A->ops->destroy = MatDestroy_MPISELLCUDA; 118*2d1451d4SHong Zhang 119*2d1451d4SHong Zhang PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPISELLCUDA)); 120*2d1451d4SHong Zhang PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpisellcuda_mpiaij_C", MatConvert_MPISELL_MPIAIJ)); 121*2d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 122*2d1451d4SHong Zhang } 123*2d1451d4SHong Zhang 124*2d1451d4SHong Zhang /*@ 125*2d1451d4SHong Zhang MatCreateSELLCUDA - Creates a sparse matrix in SELL format. 126*2d1451d4SHong Zhang This matrix will ultimately pushed down to NVIDIA GPUs. 127*2d1451d4SHong Zhang 128*2d1451d4SHong Zhang Collective 129*2d1451d4SHong Zhang 130*2d1451d4SHong Zhang Input Parameters: 131*2d1451d4SHong Zhang + comm - MPI communicator, set to `PETSC_COMM_SELF` 132*2d1451d4SHong Zhang . m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given) 133*2d1451d4SHong Zhang This value should be the same as the local size used in creating the 134*2d1451d4SHong Zhang y vector for the matrix-vector product y = Ax. 135*2d1451d4SHong Zhang . n - This value should be the same as the local size used in creating the 136*2d1451d4SHong Zhang x vector for the matrix-vector product y = Ax. (or PETSC_DECIDE to have 137*2d1451d4SHong Zhang calculated if `N` is given) For square matrices `n` is almost always `m`. 138*2d1451d4SHong Zhang . M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given) 139*2d1451d4SHong Zhang . N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given) 140*2d1451d4SHong Zhang . d_nz - number of nonzeros per row in DIAGONAL portion of local submatrix 141*2d1451d4SHong Zhang (same value is used for all local rows) 142*2d1451d4SHong Zhang . d_nnz - array containing the number of nonzeros in the various rows of the 143*2d1451d4SHong Zhang DIAGONAL portion of the local submatrix (possibly different for each row) 144*2d1451d4SHong Zhang or `NULL`, if `d_nz` is used to specify the nonzero structure. 145*2d1451d4SHong Zhang The size of this array is equal to the number of local rows, i.e `m`. 146*2d1451d4SHong Zhang For matrices you plan to factor you must leave room for the diagonal entry and 147*2d1451d4SHong Zhang put in the entry even if it is zero. 148*2d1451d4SHong Zhang . o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of local 149*2d1451d4SHong Zhang submatrix (same value is used for all local rows). 150*2d1451d4SHong Zhang - o_nnz - array containing the number of nonzeros in the various rows of the 151*2d1451d4SHong Zhang OFF-DIAGONAL portion of the local submatrix (possibly different for 152*2d1451d4SHong Zhang each row) or `NULL`, if `o_nz` is used to specify the nonzero 153*2d1451d4SHong Zhang structure. The size of this array is equal to the number 154*2d1451d4SHong Zhang of local rows, i.e `m`. 155*2d1451d4SHong Zhang 156*2d1451d4SHong Zhang Output Parameter: 157*2d1451d4SHong Zhang . A - the matrix 158*2d1451d4SHong Zhang 159*2d1451d4SHong Zhang Level: intermediate 160*2d1451d4SHong Zhang 161*2d1451d4SHong Zhang Notes: 162*2d1451d4SHong Zhang If `nnz` is given then `nz` is ignored 163*2d1451d4SHong Zhang 164*2d1451d4SHong Zhang Specify the preallocated storage with either `nz` or `nnz` (not both). 165*2d1451d4SHong Zhang Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 166*2d1451d4SHong Zhang allocation. 167*2d1451d4SHong Zhang 168*2d1451d4SHong Zhang .seealso: [](chapter_matrices), `Mat`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MATMPISELLCUDA`, `MATSELLCUDA` 169*2d1451d4SHong Zhang @*/ 170*2d1451d4SHong Zhang PetscErrorCode MatCreateSELLCUDA(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A) 171*2d1451d4SHong Zhang { 172*2d1451d4SHong Zhang PetscMPIInt size; 173*2d1451d4SHong Zhang 174*2d1451d4SHong Zhang PetscFunctionBegin; 175*2d1451d4SHong Zhang PetscCall(MatCreate(comm, A)); 176*2d1451d4SHong Zhang PetscCall(MatSetSizes(*A, m, n, M, N)); 177*2d1451d4SHong Zhang PetscCallMPI(MPI_Comm_size(comm, &size)); 178*2d1451d4SHong Zhang if (size > 1) { 179*2d1451d4SHong Zhang PetscCall(MatSetType(*A, MATMPISELLCUDA)); 180*2d1451d4SHong Zhang PetscCall(MatMPISELLSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz)); 181*2d1451d4SHong Zhang } else { 182*2d1451d4SHong Zhang PetscCall(MatSetType(*A, MATSEQSELLCUDA)); 183*2d1451d4SHong Zhang PetscCall(MatSeqSELLSetPreallocation(*A, d_nz, d_nnz)); 184*2d1451d4SHong Zhang } 185*2d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 186*2d1451d4SHong Zhang } 187*2d1451d4SHong Zhang 188*2d1451d4SHong Zhang /*MC 189*2d1451d4SHong Zhang MATSELLCUDA - "sellcuda" = "mpisellcuda" - A matrix type to be used for sparse matrices. 190*2d1451d4SHong Zhang 191*2d1451d4SHong Zhang Sliced ELLPACK matrix type whose data resides on NVIDIA GPUs. 192*2d1451d4SHong Zhang 193*2d1451d4SHong Zhang This matrix type is identical to `MATSEQSELLCUDA` when constructed with a single process communicator, 194*2d1451d4SHong Zhang and `MATMPISELLCUDA` otherwise. As a result, for single process communicators, 195*2d1451d4SHong Zhang `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported 196*2d1451d4SHong Zhang for communicators controlling multiple processes. It is recommended that you call both of 197*2d1451d4SHong Zhang the above preallocation routines for simplicity. 198*2d1451d4SHong Zhang 199*2d1451d4SHong Zhang Options Database Key: 200*2d1451d4SHong Zhang . -mat_type mpisellcuda - sets the matrix type to `MATMPISELLCUDA` during a call to MatSetFromOptions() 201*2d1451d4SHong Zhang 202*2d1451d4SHong Zhang Level: beginner 203*2d1451d4SHong Zhang 204*2d1451d4SHong Zhang .seealso: `MatCreateSELLCUDA()`, `MATSEQSELLCUDA`, `MatCreateSeqSELLCUDA()`, `MatCUDAFormatOperation()` 205*2d1451d4SHong Zhang M*/ 206