xref: /petsc/src/mat/impls/sell/mpi/mpicuda/mpisellcuda.cu (revision 8df136f97abf49e837a1a7f058b7329c31cd7d87)
12d1451d4SHong Zhang #include <petscconf.h>
2*8df136f9SHong Zhang #include <petscdevice.h>
32d1451d4SHong Zhang #include <../src/mat/impls/sell/mpi/mpisell.h> /*I "petscmat.h" I*/
42d1451d4SHong Zhang 
52d1451d4SHong Zhang PetscErrorCode MatMPISELLSetPreallocation_MPISELLCUDA(Mat B, PetscInt d_rlenmax, const PetscInt d_rlen[], PetscInt o_rlenmax, const PetscInt o_rlen[])
62d1451d4SHong Zhang {
72d1451d4SHong Zhang   Mat_MPISELL *b = (Mat_MPISELL *)B->data;
82d1451d4SHong Zhang 
92d1451d4SHong Zhang   PetscFunctionBegin;
102d1451d4SHong Zhang   PetscCall(PetscLayoutSetUp(B->rmap));
112d1451d4SHong Zhang   PetscCall(PetscLayoutSetUp(B->cmap));
122d1451d4SHong Zhang 
132d1451d4SHong Zhang   if (!B->preallocated) {
142d1451d4SHong Zhang     /* Explicitly create 2 MATSEQSELLCUDA matrices. */
152d1451d4SHong Zhang     PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
162d1451d4SHong Zhang     PetscCall(MatBindToCPU(b->A, B->boundtocpu));
172d1451d4SHong Zhang     PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
182d1451d4SHong Zhang     PetscCall(MatSetType(b->A, MATSEQSELLCUDA));
192d1451d4SHong Zhang     PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
202d1451d4SHong Zhang     PetscCall(MatBindToCPU(b->B, B->boundtocpu));
212d1451d4SHong Zhang     PetscCall(MatSetSizes(b->B, B->rmap->n, B->cmap->N, B->rmap->n, B->cmap->N));
222d1451d4SHong Zhang     PetscCall(MatSetType(b->B, MATSEQSELLCUDA));
232d1451d4SHong Zhang   }
242d1451d4SHong Zhang   PetscCall(MatSeqSELLSetPreallocation(b->A, d_rlenmax, d_rlen));
252d1451d4SHong Zhang   PetscCall(MatSeqSELLSetPreallocation(b->B, o_rlenmax, o_rlen));
262d1451d4SHong Zhang   B->preallocated  = PETSC_TRUE;
272d1451d4SHong Zhang   B->was_assembled = PETSC_FALSE;
282d1451d4SHong Zhang   B->assembled     = PETSC_FALSE;
292d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
302d1451d4SHong Zhang }
312d1451d4SHong Zhang 
322d1451d4SHong Zhang PetscErrorCode MatMult_MPISELLCUDA(Mat A, Vec xx, Vec yy)
332d1451d4SHong Zhang {
342d1451d4SHong Zhang   Mat_MPISELL *a = (Mat_MPISELL *)A->data;
352d1451d4SHong Zhang   PetscInt     nt;
362d1451d4SHong Zhang 
372d1451d4SHong Zhang   PetscFunctionBegin;
382d1451d4SHong Zhang   PetscCall(VecGetLocalSize(xx, &nt));
392d1451d4SHong Zhang   PetscCheck(nt == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->cmap->n, nt);
402d1451d4SHong Zhang   PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
412d1451d4SHong Zhang   PetscCall((*a->A->ops->mult)(a->A, xx, yy));
422d1451d4SHong Zhang   PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
432d1451d4SHong Zhang   PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
442d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
452d1451d4SHong Zhang }
462d1451d4SHong Zhang 
47*8df136f9SHong Zhang PetscErrorCode MatZeroEntries_MPISELLCUDA(Mat A)
48*8df136f9SHong Zhang {
49*8df136f9SHong Zhang   Mat_MPISELL *l = (Mat_MPISELL *)A->data;
50*8df136f9SHong Zhang 
51*8df136f9SHong Zhang   PetscFunctionBegin;
52*8df136f9SHong Zhang   PetscCall(MatZeroEntries(l->A));
53*8df136f9SHong Zhang   PetscCall(MatZeroEntries(l->B));
54*8df136f9SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
55*8df136f9SHong Zhang }
56*8df136f9SHong Zhang 
572d1451d4SHong Zhang PetscErrorCode MatMultAdd_MPISELLCUDA(Mat A, Vec xx, Vec yy, Vec zz)
582d1451d4SHong Zhang {
592d1451d4SHong Zhang   Mat_MPISELL *a = (Mat_MPISELL *)A->data;
602d1451d4SHong Zhang   PetscInt     nt;
612d1451d4SHong Zhang 
622d1451d4SHong Zhang   PetscFunctionBegin;
632d1451d4SHong Zhang   PetscCall(VecGetLocalSize(xx, &nt));
642d1451d4SHong Zhang   PetscCheck(nt == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->cmap->n, nt);
652d1451d4SHong Zhang   PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
662d1451d4SHong Zhang   PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
672d1451d4SHong Zhang   PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
682d1451d4SHong Zhang   PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
692d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
702d1451d4SHong Zhang }
712d1451d4SHong Zhang 
722d1451d4SHong Zhang PetscErrorCode MatMultTranspose_MPISELLCUDA(Mat A, Vec xx, Vec yy)
732d1451d4SHong Zhang {
742d1451d4SHong Zhang   Mat_MPISELL *a = (Mat_MPISELL *)A->data;
752d1451d4SHong Zhang   PetscInt     nt;
762d1451d4SHong Zhang 
772d1451d4SHong Zhang   PetscFunctionBegin;
782d1451d4SHong Zhang   PetscCall(VecGetLocalSize(xx, &nt));
792d1451d4SHong Zhang   PetscCheck(nt == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")", A->rmap->n, nt);
80*8df136f9SHong Zhang   PetscUseTypeMethod(a->B, multtranspose, xx, a->lvec);
81*8df136f9SHong Zhang   PetscUseTypeMethod(a->A, multtranspose, xx, yy);
822d1451d4SHong Zhang   PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
832d1451d4SHong Zhang   PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
842d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
852d1451d4SHong Zhang }
862d1451d4SHong Zhang 
87*8df136f9SHong Zhang PetscErrorCode MatSetFromOptions_MPISELLCUDA(Mat, PetscOptionItems *)
882d1451d4SHong Zhang {
89*8df136f9SHong Zhang   return PETSC_SUCCESS;
902d1451d4SHong Zhang }
912d1451d4SHong Zhang 
922d1451d4SHong Zhang PetscErrorCode MatAssemblyEnd_MPISELLCUDA(Mat A, MatAssemblyType mode)
932d1451d4SHong Zhang {
942d1451d4SHong Zhang   PetscFunctionBegin;
952d1451d4SHong Zhang   PetscCall(MatAssemblyEnd_MPISELL(A, mode));
96*8df136f9SHong Zhang   if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) PetscCall(VecSetType(((Mat_MPISELL *)A->data)->lvec, VECSEQCUDA));
972d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
982d1451d4SHong Zhang }
992d1451d4SHong Zhang 
1002d1451d4SHong Zhang PetscErrorCode MatDestroy_MPISELLCUDA(Mat A)
1012d1451d4SHong Zhang {
1022d1451d4SHong Zhang   PetscFunctionBegin;
1032d1451d4SHong Zhang   PetscCall(MatDestroy_MPISELL(A));
1042d1451d4SHong Zhang   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPISELLSetPreallocation_C", NULL));
1052d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1062d1451d4SHong Zhang }
1072d1451d4SHong Zhang 
108*8df136f9SHong Zhang PETSC_INTERN PetscErrorCode MatConvert_MPISELL_MPISELLCUDA(Mat B, MatType, MatReuse reuse, Mat *newmat)
109*8df136f9SHong Zhang {
110*8df136f9SHong Zhang   Mat_MPISELL *a;
111*8df136f9SHong Zhang   Mat          A;
112*8df136f9SHong Zhang 
113*8df136f9SHong Zhang   PetscFunctionBegin;
114*8df136f9SHong Zhang   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
115*8df136f9SHong Zhang   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
116*8df136f9SHong Zhang   else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
117*8df136f9SHong Zhang   A             = *newmat;
118*8df136f9SHong Zhang   A->boundtocpu = PETSC_FALSE;
119*8df136f9SHong Zhang   PetscCall(PetscFree(A->defaultvectype));
120*8df136f9SHong Zhang   PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
121*8df136f9SHong Zhang 
122*8df136f9SHong Zhang   a = (Mat_MPISELL *)A->data;
123*8df136f9SHong Zhang   if (a->A) PetscCall(MatSetType(a->A, MATSEQSELLCUDA));
124*8df136f9SHong Zhang   if (a->B) PetscCall(MatSetType(a->B, MATSEQSELLCUDA));
125*8df136f9SHong Zhang   if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));
126*8df136f9SHong Zhang 
127*8df136f9SHong Zhang   A->ops->assemblyend    = MatAssemblyEnd_MPISELLCUDA;
128*8df136f9SHong Zhang   A->ops->mult           = MatMult_MPISELLCUDA;
129*8df136f9SHong Zhang   A->ops->multadd        = MatMultAdd_MPISELLCUDA;
130*8df136f9SHong Zhang   A->ops->multtranspose  = MatMultTranspose_MPISELLCUDA;
131*8df136f9SHong Zhang   A->ops->setfromoptions = MatSetFromOptions_MPISELLCUDA;
132*8df136f9SHong Zhang   A->ops->destroy        = MatDestroy_MPISELLCUDA;
133*8df136f9SHong Zhang   A->ops->zeroentries    = MatZeroEntries_MPISELLCUDA;
134*8df136f9SHong Zhang 
135*8df136f9SHong Zhang   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPISELLCUDA));
136*8df136f9SHong Zhang   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPISELLSetPreallocation_C", MatMPISELLSetPreallocation_MPISELLCUDA));
137*8df136f9SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
138*8df136f9SHong Zhang }
139*8df136f9SHong Zhang 
1402d1451d4SHong Zhang PETSC_EXTERN PetscErrorCode MatCreate_MPISELLCUDA(Mat A)
1412d1451d4SHong Zhang {
1422d1451d4SHong Zhang   PetscFunctionBegin;
143*8df136f9SHong Zhang   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
1442d1451d4SHong Zhang   PetscCall(MatCreate_MPISELL(A));
145*8df136f9SHong Zhang   PetscCall(MatConvert_MPISELL_MPISELLCUDA(A, MATMPISELLCUDA, MAT_INPLACE_MATRIX, &A));
1462d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
1472d1451d4SHong Zhang }
1482d1451d4SHong Zhang 
1492d1451d4SHong Zhang /*@
1502d1451d4SHong Zhang    MatCreateSELLCUDA - Creates a sparse matrix in SELL format.
1512d1451d4SHong Zhang    This matrix will ultimately pushed down to NVIDIA GPUs.
1522d1451d4SHong Zhang 
1532d1451d4SHong Zhang    Collective
1542d1451d4SHong Zhang 
1552d1451d4SHong Zhang    Input Parameters:
1562d1451d4SHong Zhang +  comm - MPI communicator, set to `PETSC_COMM_SELF`
1572d1451d4SHong Zhang .  m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
1582d1451d4SHong Zhang            This value should be the same as the local size used in creating the
1592d1451d4SHong Zhang            y vector for the matrix-vector product y = Ax.
1602d1451d4SHong Zhang .  n - This value should be the same as the local size used in creating the
1612d1451d4SHong Zhang        x vector for the matrix-vector product y = Ax. (or PETSC_DECIDE to have
1622d1451d4SHong Zhang        calculated if `N` is given) For square matrices `n` is almost always `m`.
1632d1451d4SHong Zhang .  M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
1642d1451d4SHong Zhang .  N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
1652d1451d4SHong Zhang .  d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
1662d1451d4SHong Zhang            (same value is used for all local rows)
1672d1451d4SHong Zhang .  d_nnz - array containing the number of nonzeros in the various rows of the
1682d1451d4SHong Zhang            DIAGONAL portion of the local submatrix (possibly different for each row)
1692d1451d4SHong Zhang            or `NULL`, if `d_nz` is used to specify the nonzero structure.
1702d1451d4SHong Zhang            The size of this array is equal to the number of local rows, i.e `m`.
1712d1451d4SHong Zhang            For matrices you plan to factor you must leave room for the diagonal entry and
1722d1451d4SHong Zhang            put in the entry even if it is zero.
1732d1451d4SHong Zhang .  o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
1742d1451d4SHong Zhang            submatrix (same value is used for all local rows).
1752d1451d4SHong Zhang -  o_nnz - array containing the number of nonzeros in the various rows of the
1762d1451d4SHong Zhang            OFF-DIAGONAL portion of the local submatrix (possibly different for
1772d1451d4SHong Zhang            each row) or `NULL`, if `o_nz` is used to specify the nonzero
1782d1451d4SHong Zhang            structure. The size of this array is equal to the number
1792d1451d4SHong Zhang            of local rows, i.e `m`.
1802d1451d4SHong Zhang 
1812d1451d4SHong Zhang    Output Parameter:
1822d1451d4SHong Zhang .  A - the matrix
1832d1451d4SHong Zhang 
1842d1451d4SHong Zhang    Level: intermediate
1852d1451d4SHong Zhang 
1862d1451d4SHong Zhang    Notes:
1872d1451d4SHong Zhang    If `nnz` is given then `nz` is ignored
1882d1451d4SHong Zhang 
1892d1451d4SHong Zhang    Specify the preallocated storage with either `nz` or `nnz` (not both).
1902d1451d4SHong Zhang    Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1912d1451d4SHong Zhang    allocation.
1922d1451d4SHong Zhang 
1932d1451d4SHong Zhang .seealso: [](chapter_matrices), `Mat`, `MatCreate()`, `MatCreateSELL()`, `MatSetValues()`, `MATMPISELLCUDA`, `MATSELLCUDA`
1942d1451d4SHong Zhang @*/
1952d1451d4SHong Zhang PetscErrorCode MatCreateSELLCUDA(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
1962d1451d4SHong Zhang {
1972d1451d4SHong Zhang   PetscMPIInt size;
1982d1451d4SHong Zhang 
1992d1451d4SHong Zhang   PetscFunctionBegin;
2002d1451d4SHong Zhang   PetscCall(MatCreate(comm, A));
2012d1451d4SHong Zhang   PetscCall(MatSetSizes(*A, m, n, M, N));
2022d1451d4SHong Zhang   PetscCallMPI(MPI_Comm_size(comm, &size));
2032d1451d4SHong Zhang   if (size > 1) {
2042d1451d4SHong Zhang     PetscCall(MatSetType(*A, MATMPISELLCUDA));
2052d1451d4SHong Zhang     PetscCall(MatMPISELLSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
2062d1451d4SHong Zhang   } else {
2072d1451d4SHong Zhang     PetscCall(MatSetType(*A, MATSEQSELLCUDA));
2082d1451d4SHong Zhang     PetscCall(MatSeqSELLSetPreallocation(*A, d_nz, d_nnz));
2092d1451d4SHong Zhang   }
2102d1451d4SHong Zhang   PetscFunctionReturn(PETSC_SUCCESS);
2112d1451d4SHong Zhang }
2122d1451d4SHong Zhang 
2132d1451d4SHong Zhang /*MC
2142d1451d4SHong Zhang    MATSELLCUDA - "sellcuda" = "mpisellcuda" - A matrix type to be used for sparse matrices.
2152d1451d4SHong Zhang 
2162d1451d4SHong Zhang    Sliced ELLPACK matrix type whose data resides on NVIDIA GPUs.
2172d1451d4SHong Zhang 
2182d1451d4SHong Zhang    This matrix type is identical to `MATSEQSELLCUDA` when constructed with a single process communicator,
2192d1451d4SHong Zhang    and `MATMPISELLCUDA` otherwise.  As a result, for single process communicators,
2202d1451d4SHong Zhang    `MatSeqSELLSetPreallocation()` is supported, and similarly `MatMPISELLSetPreallocation()` is supported
2212d1451d4SHong Zhang    for communicators controlling multiple processes.  It is recommended that you call both of
2222d1451d4SHong Zhang    the above preallocation routines for simplicity.
2232d1451d4SHong Zhang 
2242d1451d4SHong Zhang    Options Database Key:
2252d1451d4SHong Zhang .  -mat_type mpisellcuda - sets the matrix type to `MATMPISELLCUDA` during a call to MatSetFromOptions()
2262d1451d4SHong Zhang 
2272d1451d4SHong Zhang   Level: beginner
2282d1451d4SHong Zhang 
2292d1451d4SHong Zhang  .seealso: `MatCreateSELLCUDA()`, `MATSEQSELLCUDA`, `MatCreateSeqSELLCUDA()`, `MatCUDAFormatOperation()`
2302d1451d4SHong Zhang M*/
231