1 #include <petscconf.h> 2 #include <petscdevice.h> 3 #include <../src/mat/impls/sell/mpi/mpisell.h> /*I "petscmat.h" I*/ 4 5 PetscErrorCode MatMPISELLSetPreallocation_MPISELLCUDA(Mat B, PetscInt d_rlenmax, const PetscInt d_rlen[], PetscInt o_rlenmax, const PetscInt o_rlen[]) 6 { 7 Mat_MPISELL *b = (Mat_MPISELL *)B->data; 8 9 PetscFunctionBegin; 10 PetscCall(PetscLayoutSetUp(B->rmap)); 11 PetscCall(PetscLayoutSetUp(B->cmap)); 12 13 if (!B->preallocated) { 14 /* Explicitly create 2 MATSEQSELLCUDA matrices. */ 15 PetscCall(MatCreate(PETSC_COMM_SELF, &b->A)); 16 PetscCall(MatBindToCPU(b->A, B->boundtocpu)); 17 PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n)); 18 PetscCall(MatSetType(b->A, MATSEQSELLCUDA)); 19 PetscCall(MatCreate(PETSC_COMM_SELF, &b->B)); 20 PetscCall(MatBindToCPU(b->B, B->boundtocpu)); 21 PetscCall(MatSetSizes(b->B, B->rmap->n, B->cmap->N, B->rmap->n, B->cmap->N)); 22 PetscCall(MatSetType(b->B, MATSEQSELLCUDA)); 23 } 24 PetscCall(MatSeqSELLSetPreallocation(b->A, d_rlenmax, d_rlen)); 25 PetscCall(MatSeqSELLSetPreallocation(b->B, o_rlenmax, o_rlen)); 26 B->preallocated = PETSC_TRUE; 27 B->was_assembled = PETSC_FALSE; 28 B->assembled = PETSC_FALSE; 29 PetscFunctionReturn(PETSC_SUCCESS); 30 } 31 32 PetscErrorCode MatMult_MPISELLCUDA(Mat A, Vec xx, Vec yy) 33 { 34 Mat_MPISELL *a = (Mat_MPISELL *)A->data; 35 PetscInt nt; 36 37 PetscFunctionBegin; 38 PetscCall(VecGetLocalSize(xx, &nt)); 39 PetscCheck(nt == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->cmap->n, nt); 40 PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 41 PetscCall((*a->A->ops->mult)(a->A, xx, yy)); 42 PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 43 PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy)); 44 PetscFunctionReturn(PETSC_SUCCESS); 45 } 46 47 PetscErrorCode MatZeroEntries_MPISELLCUDA(Mat A) 48 { 49 Mat_MPISELL *l = (Mat_MPISELL *)A->data; 50 51 PetscFunctionBegin; 52 PetscCall(MatZeroEntries(l->A)); 53 PetscCall(MatZeroEntries(l->B)); 54 PetscFunctionReturn(PETSC_SUCCESS); 55 } 56 57 PetscErrorCode MatMultAdd_MPISELLCUDA(Mat A, Vec xx, Vec yy, Vec zz) 58 { 59 Mat_MPISELL *a = (Mat_MPISELL *)A->data; 60 PetscInt nt; 61 62 PetscFunctionBegin; 63 PetscCall(VecGetLocalSize(xx, &nt)); 64 PetscCheck(nt == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->cmap->n, nt); 65 PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 66 PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz)); 67 PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 68 PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz)); 69 PetscFunctionReturn(PETSC_SUCCESS); 70 } 71 72 PetscErrorCode MatMultTranspose_MPISELLCUDA(Mat A, Vec xx, Vec yy) 73 { 74 Mat_MPISELL *a = (Mat_MPISELL *)A->data; 75 PetscInt nt; 76 77 PetscFunctionBegin; 78 PetscCall(VecGetLocalSize(xx, &nt)); 79 PetscCheck(nt == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->rmap->n, nt); 80 PetscUseTypeMethod(a->B, multtranspose, xx, a->lvec); 81 PetscUseTypeMethod(a->A, multtranspose, xx, yy); 82 PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE)); 83 PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE)); 84 PetscFunctionReturn(PETSC_SUCCESS); 85 } 86 87 PetscErrorCode MatSetFromOptions_MPISELLCUDA(Mat, PetscOptionItems *) 88 { 89 return PETSC_SUCCESS; 90 } 91 92 PetscErrorCode MatAssemblyEnd_MPISELLCUDA(Mat A, MatAssemblyType mode) 93 { 94 PetscFunctionBegin; 95 PetscCall(MatAssemblyEnd_MPISELL(A, mode)); 96 if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) PetscCall(VecSetType(((Mat_MPISELL *)A->data)->lvec, VECSEQCUDA)); 97 PetscFunctionReturn(PETSC_SUCCESS); 98 } 99 100 PetscErrorCode MatDestroy_MPISELLCUDA(Mat A) 101 { 102 PetscFunctionBegin; 103 PetscCall(MatDestroy_MPISELL(A)); 104 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPISELLSetPreallocation_C", NULL)); 105 PetscFunctionReturn(PETSC_SUCCESS); 106 } 107 108 PETSC_INTERN PetscErrorCode MatConvert_MPISELL_MPISELLCUDA(Mat B, MatType, MatReuse reuse, Mat *newmat) 109 { 110 Mat_MPISELL *a; 111 Mat A; 112 113 PetscFunctionBegin; 114 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 115 if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat)); 116 else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN)); 117 A = *newmat; 118 A->boundtocpu = PETSC_FALSE; 119 PetscCall(PetscFree(A->defaultvectype)); 120 PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype)); 121 122 a = (Mat_MPISELL *)A->data; 123 if (a->A) PetscCall(MatSetType(a->A, MATSEQSELLCUDA)); 124 if (a->B) PetscCall(MatSetType(a->B, MATSEQSELLCUDA)); 125 if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA)); 126 127 A->ops->assemblyend = MatAssemblyEnd_MPISELLCUDA; 128 A->ops->mult = MatMult_MPISELLCUDA; 129 A->ops->multadd = MatMultAdd_MPISELLCUDA; 130 A->ops->multtranspose = MatMultTranspose_MPISELLCUDA; 131 A->ops->setfromoptions = MatSetFromOptions_MPISELLCUDA; 132 A->ops->destroy = MatDestroy_MPISELLCUDA; 133 A->ops->zeroentries = MatZeroEntries_MPISELLCUDA; 134 135 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPISELLCUDA)); 136 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPISELLSetPreallocation_C", MatMPISELLSetPreallocation_MPISELLCUDA)); 137 PetscFunctionReturn(PETSC_SUCCESS); 138 } 139 140 PETSC_EXTERN PetscErrorCode MatCreate_MPISELLCUDA(Mat A) 141 { 142 PetscFunctionBegin; 143 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 144 PetscCall(MatCreate_MPISELL(A)); 145 PetscCall(MatConvert_MPISELL_MPISELLCUDA(A, MATMPISELLCUDA, MAT_INPLACE_MATRIX, &A)); 146 PetscFunctionReturn(PETSC_SUCCESS); 147 } 148 149 /*@ 150 MatCreateSELLCUDA - Creates a sparse matrix in SELL format. 151 This matrix will ultimately pushed down to NVIDIA GPUs. 152 153 Collective 154 155 Input Parameters: 156 + comm - MPI communicator, set to `PETSC_COMM_SELF` 157 . m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given) 158 This value should be the same as the local size used in creating the 159 y vector for the matrix-vector product y = Ax. 160 . n - This value should be the same as the local size used in creating the 161 x vector for the matrix-vector product y = Ax. (or PETSC_DECIDE to have 162 calculated if `N` is given) For square matrices `n` is almost always `m`. 163 . M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given) 164 . N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given) 165 . d_nz - number of nonzeros per row in DIAGONAL portion of local submatrix 166 (same value is used for all local rows) 167 . d_nnz - array containing the number of nonzeros in the various rows of the 168 DIAGONAL portion of the local submatrix (possibly different for each row) 169 or `NULL`, if `d_nz` is used to specify the nonzero structure. 170 The size of this array is equal to the number of local rows, i.e `m`. 171 For matrices you plan to factor you must leave room for the diagonal entry and 172 put in the entry even if it is zero. 173 . o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of local 174 submatrix (same value is used for all local rows). 175 - o_nnz - array containing the number of nonzeros in the various rows of the 176 OFF-DIAGONAL portion of the local submatrix (possibly different for 177 each row) or `NULL`, if `o_nz` is used to specify the nonzero 178 structure. The size of this array is equal to the number 179 of local rows, i.e `m`. 180 181 Output Parameter: 182 . A - the matrix 183 184 Level: intermediate 185 186 Notes: 187 If `nnz` is given then `nz` is ignored 188 189 Specify the preallocated storage with either `nz` or `nnz` (not both). 190 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 191 allocation. 192 193 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MATMPISELLCUDA`, `MATSELLCUDA` 194 @*/ 195 PetscErrorCode MatCreateSELLCUDA(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A) 196 { 197 PetscMPIInt size; 198 199 PetscFunctionBegin; 200 PetscCall(MatCreate(comm, A)); 201 PetscCall(MatSetSizes(*A, m, n, M, N)); 202 PetscCallMPI(MPI_Comm_size(comm, &size)); 203 if (size > 1) { 204 PetscCall(MatSetType(*A, MATMPISELLCUDA)); 205 PetscCall(MatMPISELLSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz)); 206 } else { 207 PetscCall(MatSetType(*A, MATSEQSELLCUDA)); 208 PetscCall(MatSeqSELLSetPreallocation(*A, d_nz, d_nnz)); 209 } 210 PetscFunctionReturn(PETSC_SUCCESS); 211 } 212 213 /*MC 214 MATSELLCUDA - "sellcuda" = "mpisellcuda" - A matrix type to be used for sparse matrices. 215 216 Sliced ELLPACK matrix type whose data resides on NVIDIA GPUs. 217 218 This matrix type is identical to `MATSEQSELLCUDA` when constructed with a single process communicator, 219 and `MATMPISELLCUDA` otherwise. As a result, for single process communicators, 220 `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported 221 for communicators controlling multiple processes. It is recommended that you call both of 222 the above preallocation routines for simplicity. 223 224 Options Database Key: 225 . -mat_type mpisellcuda - sets the matrix type to `MATMPISELLCUDA` during a call to MatSetFromOptions() 226 227 Level: beginner 228 229 .seealso: `MatCreateSELLCUDA()`, `MATSEQSELLCUDA`, `MatCreateSeqSELLCUDA()`, `MatCUDAFormatOperation()` 230 M*/ 231