12d1451d4SHong Zhang #include <petscconf.h> 2*8df136f9SHong Zhang #include <petscdevice.h> 32d1451d4SHong Zhang #include <../src/mat/impls/sell/mpi/mpisell.h> /*I "petscmat.h" I*/ 42d1451d4SHong Zhang 52d1451d4SHong Zhang PetscErrorCode MatMPISELLSetPreallocation_MPISELLCUDA(Mat B, PetscInt d_rlenmax, const PetscInt d_rlen[], PetscInt o_rlenmax, const PetscInt o_rlen[]) 62d1451d4SHong Zhang { 72d1451d4SHong Zhang Mat_MPISELL *b = (Mat_MPISELL *)B->data; 82d1451d4SHong Zhang 92d1451d4SHong Zhang PetscFunctionBegin; 102d1451d4SHong Zhang PetscCall(PetscLayoutSetUp(B->rmap)); 112d1451d4SHong Zhang PetscCall(PetscLayoutSetUp(B->cmap)); 122d1451d4SHong Zhang 132d1451d4SHong Zhang if (!B->preallocated) { 142d1451d4SHong Zhang /* Explicitly create 2 MATSEQSELLCUDA matrices. */ 152d1451d4SHong Zhang PetscCall(MatCreate(PETSC_COMM_SELF, &b->A)); 162d1451d4SHong Zhang PetscCall(MatBindToCPU(b->A, B->boundtocpu)); 172d1451d4SHong Zhang PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n)); 182d1451d4SHong Zhang PetscCall(MatSetType(b->A, MATSEQSELLCUDA)); 192d1451d4SHong Zhang PetscCall(MatCreate(PETSC_COMM_SELF, &b->B)); 202d1451d4SHong Zhang PetscCall(MatBindToCPU(b->B, B->boundtocpu)); 212d1451d4SHong Zhang PetscCall(MatSetSizes(b->B, B->rmap->n, B->cmap->N, B->rmap->n, B->cmap->N)); 222d1451d4SHong Zhang PetscCall(MatSetType(b->B, MATSEQSELLCUDA)); 232d1451d4SHong Zhang } 242d1451d4SHong Zhang PetscCall(MatSeqSELLSetPreallocation(b->A, d_rlenmax, d_rlen)); 252d1451d4SHong Zhang PetscCall(MatSeqSELLSetPreallocation(b->B, o_rlenmax, o_rlen)); 262d1451d4SHong Zhang B->preallocated = PETSC_TRUE; 272d1451d4SHong Zhang B->was_assembled = PETSC_FALSE; 282d1451d4SHong Zhang B->assembled = PETSC_FALSE; 292d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 302d1451d4SHong Zhang } 312d1451d4SHong Zhang 322d1451d4SHong Zhang PetscErrorCode MatMult_MPISELLCUDA(Mat A, Vec xx, Vec yy) 332d1451d4SHong Zhang { 342d1451d4SHong Zhang Mat_MPISELL *a = (Mat_MPISELL *)A->data; 352d1451d4SHong Zhang PetscInt nt; 362d1451d4SHong Zhang 372d1451d4SHong Zhang PetscFunctionBegin; 382d1451d4SHong Zhang PetscCall(VecGetLocalSize(xx, &nt)); 392d1451d4SHong Zhang PetscCheck(nt == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->cmap->n, nt); 402d1451d4SHong Zhang PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 412d1451d4SHong Zhang PetscCall((*a->A->ops->mult)(a->A, xx, yy)); 422d1451d4SHong Zhang PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 432d1451d4SHong Zhang PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy)); 442d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 452d1451d4SHong Zhang } 462d1451d4SHong Zhang 47*8df136f9SHong Zhang PetscErrorCode MatZeroEntries_MPISELLCUDA(Mat A) 48*8df136f9SHong Zhang { 49*8df136f9SHong Zhang Mat_MPISELL *l = (Mat_MPISELL *)A->data; 50*8df136f9SHong Zhang 51*8df136f9SHong Zhang PetscFunctionBegin; 52*8df136f9SHong Zhang PetscCall(MatZeroEntries(l->A)); 53*8df136f9SHong Zhang PetscCall(MatZeroEntries(l->B)); 54*8df136f9SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 55*8df136f9SHong Zhang } 56*8df136f9SHong Zhang 572d1451d4SHong Zhang PetscErrorCode MatMultAdd_MPISELLCUDA(Mat A, Vec xx, Vec yy, Vec zz) 582d1451d4SHong Zhang { 592d1451d4SHong Zhang Mat_MPISELL *a = (Mat_MPISELL *)A->data; 602d1451d4SHong Zhang PetscInt nt; 612d1451d4SHong Zhang 622d1451d4SHong Zhang PetscFunctionBegin; 632d1451d4SHong Zhang PetscCall(VecGetLocalSize(xx, &nt)); 642d1451d4SHong Zhang PetscCheck(nt == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->cmap->n, nt); 652d1451d4SHong Zhang PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 662d1451d4SHong Zhang PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz)); 672d1451d4SHong Zhang PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD)); 682d1451d4SHong Zhang PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz)); 692d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 702d1451d4SHong Zhang } 712d1451d4SHong Zhang 722d1451d4SHong Zhang PetscErrorCode MatMultTranspose_MPISELLCUDA(Mat A, Vec xx, Vec yy) 732d1451d4SHong Zhang { 742d1451d4SHong Zhang Mat_MPISELL *a = (Mat_MPISELL *)A->data; 752d1451d4SHong Zhang PetscInt nt; 762d1451d4SHong Zhang 772d1451d4SHong Zhang PetscFunctionBegin; 782d1451d4SHong Zhang PetscCall(VecGetLocalSize(xx, &nt)); 792d1451d4SHong Zhang PetscCheck(nt == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->rmap->n, nt); 80*8df136f9SHong Zhang PetscUseTypeMethod(a->B, multtranspose, xx, a->lvec); 81*8df136f9SHong Zhang PetscUseTypeMethod(a->A, multtranspose, xx, yy); 822d1451d4SHong Zhang PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE)); 832d1451d4SHong Zhang PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE)); 842d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 852d1451d4SHong Zhang } 862d1451d4SHong Zhang 87*8df136f9SHong Zhang PetscErrorCode MatSetFromOptions_MPISELLCUDA(Mat, PetscOptionItems *) 882d1451d4SHong Zhang { 89*8df136f9SHong Zhang return PETSC_SUCCESS; 902d1451d4SHong Zhang } 912d1451d4SHong Zhang 922d1451d4SHong Zhang PetscErrorCode MatAssemblyEnd_MPISELLCUDA(Mat A, MatAssemblyType mode) 932d1451d4SHong Zhang { 942d1451d4SHong Zhang PetscFunctionBegin; 952d1451d4SHong Zhang PetscCall(MatAssemblyEnd_MPISELL(A, mode)); 96*8df136f9SHong Zhang if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) PetscCall(VecSetType(((Mat_MPISELL *)A->data)->lvec, VECSEQCUDA)); 972d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 982d1451d4SHong Zhang } 992d1451d4SHong Zhang 1002d1451d4SHong Zhang PetscErrorCode MatDestroy_MPISELLCUDA(Mat A) 1012d1451d4SHong Zhang { 1022d1451d4SHong Zhang PetscFunctionBegin; 1032d1451d4SHong Zhang PetscCall(MatDestroy_MPISELL(A)); 1042d1451d4SHong Zhang PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPISELLSetPreallocation_C", NULL)); 1052d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 1062d1451d4SHong Zhang } 1072d1451d4SHong Zhang 108*8df136f9SHong Zhang PETSC_INTERN PetscErrorCode MatConvert_MPISELL_MPISELLCUDA(Mat B, MatType, MatReuse reuse, Mat *newmat) 109*8df136f9SHong Zhang { 110*8df136f9SHong Zhang Mat_MPISELL *a; 111*8df136f9SHong Zhang Mat A; 112*8df136f9SHong Zhang 113*8df136f9SHong Zhang PetscFunctionBegin; 114*8df136f9SHong Zhang PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 115*8df136f9SHong Zhang if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat)); 116*8df136f9SHong Zhang else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN)); 117*8df136f9SHong Zhang A = *newmat; 118*8df136f9SHong Zhang A->boundtocpu = PETSC_FALSE; 119*8df136f9SHong Zhang PetscCall(PetscFree(A->defaultvectype)); 120*8df136f9SHong Zhang PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype)); 121*8df136f9SHong Zhang 122*8df136f9SHong Zhang a = (Mat_MPISELL *)A->data; 123*8df136f9SHong Zhang if (a->A) PetscCall(MatSetType(a->A, MATSEQSELLCUDA)); 124*8df136f9SHong Zhang if (a->B) PetscCall(MatSetType(a->B, MATSEQSELLCUDA)); 125*8df136f9SHong Zhang if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA)); 126*8df136f9SHong Zhang 127*8df136f9SHong Zhang A->ops->assemblyend = MatAssemblyEnd_MPISELLCUDA; 128*8df136f9SHong Zhang A->ops->mult = MatMult_MPISELLCUDA; 129*8df136f9SHong Zhang A->ops->multadd = MatMultAdd_MPISELLCUDA; 130*8df136f9SHong Zhang A->ops->multtranspose = MatMultTranspose_MPISELLCUDA; 131*8df136f9SHong Zhang A->ops->setfromoptions = MatSetFromOptions_MPISELLCUDA; 132*8df136f9SHong Zhang A->ops->destroy = MatDestroy_MPISELLCUDA; 133*8df136f9SHong Zhang A->ops->zeroentries = MatZeroEntries_MPISELLCUDA; 134*8df136f9SHong Zhang 135*8df136f9SHong Zhang PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPISELLCUDA)); 136*8df136f9SHong Zhang PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPISELLSetPreallocation_C", MatMPISELLSetPreallocation_MPISELLCUDA)); 137*8df136f9SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 138*8df136f9SHong Zhang } 139*8df136f9SHong Zhang 1402d1451d4SHong Zhang PETSC_EXTERN PetscErrorCode MatCreate_MPISELLCUDA(Mat A) 1412d1451d4SHong Zhang { 1422d1451d4SHong Zhang PetscFunctionBegin; 143*8df136f9SHong Zhang PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 1442d1451d4SHong Zhang PetscCall(MatCreate_MPISELL(A)); 145*8df136f9SHong Zhang PetscCall(MatConvert_MPISELL_MPISELLCUDA(A, MATMPISELLCUDA, MAT_INPLACE_MATRIX, &A)); 1462d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 1472d1451d4SHong Zhang } 1482d1451d4SHong Zhang 1492d1451d4SHong Zhang /*@ 1502d1451d4SHong Zhang MatCreateSELLCUDA - Creates a sparse matrix in SELL format. 1512d1451d4SHong Zhang This matrix will ultimately pushed down to NVIDIA GPUs. 1522d1451d4SHong Zhang 1532d1451d4SHong Zhang Collective 1542d1451d4SHong Zhang 1552d1451d4SHong Zhang Input Parameters: 1562d1451d4SHong Zhang + comm - MPI communicator, set to `PETSC_COMM_SELF` 1572d1451d4SHong Zhang . m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given) 1582d1451d4SHong Zhang This value should be the same as the local size used in creating the 1592d1451d4SHong Zhang y vector for the matrix-vector product y = Ax. 1602d1451d4SHong Zhang . n - This value should be the same as the local size used in creating the 1612d1451d4SHong Zhang x vector for the matrix-vector product y = Ax. (or PETSC_DECIDE to have 1622d1451d4SHong Zhang calculated if `N` is given) For square matrices `n` is almost always `m`. 1632d1451d4SHong Zhang . M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given) 1642d1451d4SHong Zhang . N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given) 1652d1451d4SHong Zhang . d_nz - number of nonzeros per row in DIAGONAL portion of local submatrix 1662d1451d4SHong Zhang (same value is used for all local rows) 1672d1451d4SHong Zhang . d_nnz - array containing the number of nonzeros in the various rows of the 1682d1451d4SHong Zhang DIAGONAL portion of the local submatrix (possibly different for each row) 1692d1451d4SHong Zhang or `NULL`, if `d_nz` is used to specify the nonzero structure. 1702d1451d4SHong Zhang The size of this array is equal to the number of local rows, i.e `m`. 1712d1451d4SHong Zhang For matrices you plan to factor you must leave room for the diagonal entry and 1722d1451d4SHong Zhang put in the entry even if it is zero. 1732d1451d4SHong Zhang . o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of local 1742d1451d4SHong Zhang submatrix (same value is used for all local rows). 1752d1451d4SHong Zhang - o_nnz - array containing the number of nonzeros in the various rows of the 1762d1451d4SHong Zhang OFF-DIAGONAL portion of the local submatrix (possibly different for 1772d1451d4SHong Zhang each row) or `NULL`, if `o_nz` is used to specify the nonzero 1782d1451d4SHong Zhang structure. The size of this array is equal to the number 1792d1451d4SHong Zhang of local rows, i.e `m`. 1802d1451d4SHong Zhang 1812d1451d4SHong Zhang Output Parameter: 1822d1451d4SHong Zhang . A - the matrix 1832d1451d4SHong Zhang 1842d1451d4SHong Zhang Level: intermediate 1852d1451d4SHong Zhang 1862d1451d4SHong Zhang Notes: 1872d1451d4SHong Zhang If `nnz` is given then `nz` is ignored 1882d1451d4SHong Zhang 1892d1451d4SHong Zhang Specify the preallocated storage with either `nz` or `nnz` (not both). 1902d1451d4SHong Zhang Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1912d1451d4SHong Zhang allocation. 1922d1451d4SHong Zhang 1932d1451d4SHong Zhang .seealso: [](chapter_matrices), `Mat`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MATMPISELLCUDA`, `MATSELLCUDA` 1942d1451d4SHong Zhang @*/ 1952d1451d4SHong Zhang PetscErrorCode MatCreateSELLCUDA(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A) 1962d1451d4SHong Zhang { 1972d1451d4SHong Zhang PetscMPIInt size; 1982d1451d4SHong Zhang 1992d1451d4SHong Zhang PetscFunctionBegin; 2002d1451d4SHong Zhang PetscCall(MatCreate(comm, A)); 2012d1451d4SHong Zhang PetscCall(MatSetSizes(*A, m, n, M, N)); 2022d1451d4SHong Zhang PetscCallMPI(MPI_Comm_size(comm, &size)); 2032d1451d4SHong Zhang if (size > 1) { 2042d1451d4SHong Zhang PetscCall(MatSetType(*A, MATMPISELLCUDA)); 2052d1451d4SHong Zhang PetscCall(MatMPISELLSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz)); 2062d1451d4SHong Zhang } else { 2072d1451d4SHong Zhang PetscCall(MatSetType(*A, MATSEQSELLCUDA)); 2082d1451d4SHong Zhang PetscCall(MatSeqSELLSetPreallocation(*A, d_nz, d_nnz)); 2092d1451d4SHong Zhang } 2102d1451d4SHong Zhang PetscFunctionReturn(PETSC_SUCCESS); 2112d1451d4SHong Zhang } 2122d1451d4SHong Zhang 2132d1451d4SHong Zhang /*MC 2142d1451d4SHong Zhang MATSELLCUDA - "sellcuda" = "mpisellcuda" - A matrix type to be used for sparse matrices. 2152d1451d4SHong Zhang 2162d1451d4SHong Zhang Sliced ELLPACK matrix type whose data resides on NVIDIA GPUs. 2172d1451d4SHong Zhang 2182d1451d4SHong Zhang This matrix type is identical to `MATSEQSELLCUDA` when constructed with a single process communicator, 2192d1451d4SHong Zhang and `MATMPISELLCUDA` otherwise. As a result, for single process communicators, 2202d1451d4SHong Zhang `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported 2212d1451d4SHong Zhang for communicators controlling multiple processes. It is recommended that you call both of 2222d1451d4SHong Zhang the above preallocation routines for simplicity. 2232d1451d4SHong Zhang 2242d1451d4SHong Zhang Options Database Key: 2252d1451d4SHong Zhang . -mat_type mpisellcuda - sets the matrix type to `MATMPISELLCUDA` during a call to MatSetFromOptions() 2262d1451d4SHong Zhang 2272d1451d4SHong Zhang Level: beginner 2282d1451d4SHong Zhang 2292d1451d4SHong Zhang .seealso: `MatCreateSELLCUDA()`, `MATSEQSELLCUDA`, `MatCreateSeqSELLCUDA()`, `MatCUDAFormatOperation()` 2302d1451d4SHong Zhang M*/ 231