xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision cbf1f8ac1337efc26357c0f10e7bd58aa32e376a)
12327d61dSSatish Balay #include "petscconf.h"
22327d61dSSatish Balay PETSC_CUDA_EXTERN_C_BEGIN
39ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
42327d61dSSatish Balay PETSC_CUDA_EXTERN_C_END
5bbf3fe20SPaul Mullowney #include "mpicusparsematimpl.h"
69ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
/*
   Preallocation routine for MATMPIAIJCUSPARSE: validates the requested
   nonzero counts, creates the sequential diagonal (b->A) and off-diagonal
   (b->B) blocks as MATSEQAIJCUSPARSE matrices on first use, preallocates
   both, and pushes the GPU storage formats recorded in the spptr struct
   down to the two blocks.

   Input Parameters:
+  B     - the matrix being preallocated
.  d_nz  - nonzeros per row of the diagonal block (same for all rows), or PETSC_DEFAULT/PETSC_DECIDE
.  d_nnz - per-row nonzero counts for the diagonal block, or NULL
.  o_nz  - nonzeros per row of the off-diagonal block (same for all rows), or PETSC_DEFAULT/PETSC_DECIDE
-  o_nnz - per-row nonzero counts for the off-diagonal block, or NULL
*/
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b               = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  /* Fall back to fixed per-row defaults when the caller did not choose. */
  if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
  if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
  if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
  if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);

  /* Layouts must be set up before B->rmap->n below is meaningful. */
  ierr = PetscLayoutSetBlockSize(B->rmap,1);CHKERRQ(ierr);
  ierr = PetscLayoutSetBlockSize(B->cmap,1);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* Validate every user-supplied per-row count. */
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent(B,b->A);CHKERRQ(ierr);
    /* Off-diagonal block spans all global columns (compressed later). */
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent(B,b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* Propagate the GPU storage formats chosen for this MPI matrix. */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
55e057df02SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE"
/*
   Creates GPU (VECCUSP) vectors conforming to the matrix:
   the right vector follows the column layout (for y = A*x input),
   the left vector follows the row layout (for the output).
   Either output pointer may be NULL if that vector is not wanted.
*/
PetscErrorCode  MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    /* right vectors conform to the COLUMN layout; the original code used
       mat->rmap->bs here, which is wrong when row/column block sizes differ */
    ierr = VecSetBlockSize(*right,mat->cmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr);
    ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr);
    ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
819ae82921SPaul Mullowney 
829ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence is different sequence
     than the CPU version. In particular, the diagonal block
     multiplication kernel is launched in one stream. Then,
     in a separate stream, the data transfers from DeviceToHost
     (with MPI messaging in between), then HostToDevice are
     launched. Once the data transfer stream is synchronized,
     to ensure messaging is complete, the MatMultAdd kernel
     is launched in the original (MatMult) stream to protect
     against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  /* x must conform to the column layout of A */
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  /* yy = A_diag * xx, launched before the communication so it overlaps */
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  /* gather the off-process entries of xx into a->lvec */
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* yy += B_offdiag * lvec */
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
113ca45077fSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
/*
   Computes yy = A^T * xx on the GPU.

   Fixes relative to the previous revision:
   - __FUNCT__ said "MatMult_MPIAIJCUSPARSE" (copy-paste error).
   - The size check compared xx against A->cmap->n, but the input of a
     transpose multiply conforms to the ROW layout (length A->rmap->n).
   - The communication pattern was copied from MatMult (forward scatter of
     xx into lvec followed by multtransposeadd of lvec), which is
     dimensionally inconsistent for the transpose. The correct sequence,
     mirroring the CPU MatMultTranspose_MPIAIJ, is: apply B^T locally into
     lvec, apply A_diag^T locally into yy, then accumulate the lvec partial
     sums onto the owning ranks with an ADD_VALUES reverse scatter.

   The local transpose kernels are launched before the reverse scatter so
   communication can overlap with the diagonal-block computation. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  /* input of A^T conforms to the row layout of A */
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);CHKERRQ(ierr);
  /* off-diagonal contribution: lvec = B^T * xx */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  /* local diagonal contribution: yy = A_diag^T * xx */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  /* add the partial results from other ranks into yy */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1449ae82921SPaul Mullowney 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
/*
   Records the requested CUSPARSE storage format for the diagonal and/or
   off-diagonal block of an MPIAIJCUSPARSE matrix; the formats are pushed
   down to the sequential blocks at preallocation time.

   Fix: the unsupported-operation error message previously listed
   MAT_CUSPARSE_MULT_DIAG twice and named the nonexistent value
   MAT_CUSPARSE_MULT_ALL; it now matches the actual case labels. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
EXTERN_C_END
171ca45077fSPaul Mullowney 
172e057df02SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
/*
   Processes the -mat_cusparse_* options controlling the GPU storage format
   of the diagonal and off-diagonal blocks (only for unfactored matrices).

   Fix: PetscOptionsHead was previously called BEFORE
   PetscObjectOptionsBegin; the head/tail helpers must run inside the
   options begin/end scope, so the two calls are now in the correct order. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscObjectOptionsBegin((PetscObject)A);
  ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    /* -mat_cusparse_storage_format sets BOTH blocks at once */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
204e057df02SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
/*
   Destroys an MPIAIJCUSPARSE matrix: frees the CUSPARSE bookkeeping struct
   hanging off spptr, then falls through to the generic MPIAIJ destroy.

   Fix: the previous revision wrote "cusparseStruct = 0;", a dead store to
   a local copy of the pointer; the dangling a->spptr is now cleared so no
   stale pointer survives into MatDestroy_MPIAIJ. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  try {
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  a->spptr = 0;  /* clear the freed pointer, not just the local copy */

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
224ca45077fSPaul Mullowney 
EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
/*
   Constructor for MATMPIAIJCUSPARSE: builds a plain MPIAIJ matrix, then
   layers the CUSPARSE pieces on top — the spptr bookkeeping struct with
   default (CSR) storage formats, the GPU-aware operation table, and the
   composed preallocation/format-setting functions. */
PetscErrorCode  MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE * cusparseStruct;

  PetscFunctionBegin;
  /* start from the ordinary host MPIAIJ implementation */
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatMPIAIJSetPreallocation_C",
                                           "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE",
                                           MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);

  /* attach the CUSPARSE bookkeeping struct with CSR defaults */
  a              = (Mat_MPIAIJ*)A->data;
  cusparseStruct = new Mat_MPIAIJCUSPARSE;
  a->spptr       = cusparseStruct;

  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;

  /* override the operations that run on the GPU */
  A->ops->getvecs        = MatGetVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatCUSPARSESetFormat_C", "MatCUSPARSESetFormat_MPIAIJCUSPARSE", MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
EXTERN_C_END
2579ae82921SPaul Mullowney 
2583f9c0db1SPaul Mullowney /*@
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
262e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
263e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
264e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
265e057df02SPaul Mullowney    This type is only available when using the 'txpetscgpu' package. Use --download-txpetscgpu
266e057df02SPaul Mullowney    to build/install PETSc to use different CUSPARSE base matrix types.
2679ae82921SPaul Mullowney 
268e057df02SPaul Mullowney    Collective on MPI_Comm
269e057df02SPaul Mullowney 
270e057df02SPaul Mullowney    Input Parameters:
+  comm - MPI communicator
272e057df02SPaul Mullowney .  m - number of rows
273e057df02SPaul Mullowney .  n - number of columns
274e057df02SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
275e057df02SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
2760298fd71SBarry Smith          (possibly different for each row) or NULL
277e057df02SPaul Mullowney 
278e057df02SPaul Mullowney    Output Parameter:
279e057df02SPaul Mullowney .  A - the matrix
280e057df02SPaul Mullowney 
281e057df02SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
282e057df02SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
283e057df02SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
284e057df02SPaul Mullowney 
285e057df02SPaul Mullowney    Notes:
286e057df02SPaul Mullowney    If nnz is given then nz is ignored
287e057df02SPaul Mullowney 
288e057df02SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
289e057df02SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
290e057df02SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
291e057df02SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
292e057df02SPaul Mullowney 
293e057df02SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
2940298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
295e057df02SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
296e057df02SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
297e057df02SPaul Mullowney 
298e057df02SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
299e057df02SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
300e057df02SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
301e057df02SPaul Mullowney    reusing matrix information to achieve increased efficiency.
302e057df02SPaul Mullowney 
303e057df02SPaul Mullowney    Level: intermediate
304e057df02SPaul Mullowney 
305e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
306e057df02SPaul Mullowney @*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
/* Convenience constructor: creates and preallocates an AIJCUSPARSE matrix,
   choosing the sequential flavor on a one-rank communicator and the MPI
   flavor otherwise.  See the manpage above for the parameter meanings. */
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size == 1) {
    /* single rank: sequential type, diagonal preallocation only */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3279ae82921SPaul Mullowney 
3283f9c0db1SPaul Mullowney /*M
329e057df02SPaul Mullowney    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
330e057df02SPaul Mullowney 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in CSR format.
3328468deeeSKarl Rupp    All matrix calculations are performed using the Nvidia CUSPARSE library. Use of the
333e057df02SPaul Mullowney    CUSPARSE library REQUIRES the 'txpetscgpu' package. ELL and HYB formats are also available
334e057df02SPaul Mullowney    in the txpetscgpu package. Use --download-txpetscgpu to build/install PETSc to use different
335e057df02SPaul Mullowney    GPU storage formats with CUSPARSE matrix types.
3369ae82921SPaul Mullowney 
3379ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
3389ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
3399ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
3409ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
3419ae82921SPaul Mullowney    the above preallocation routines for simplicity.
3429ae82921SPaul Mullowney 
3439ae82921SPaul Mullowney    Options Database Keys:
344e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
3458468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3468468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3478468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3489ae82921SPaul Mullowney 
3499ae82921SPaul Mullowney   Level: beginner
3509ae82921SPaul Mullowney 
3518468deeeSKarl Rupp  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
354