#include "petscconf.h"
PETSC_CUDA_EXTERN_C_BEGIN
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
PETSC_CUDA_EXTERN_C_END
#include "mpicusparsematimpl.h"

#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
  if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
  if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
  if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);

  ierr = PetscLayoutSetBlockSize(B->rmap,1);CHKERRQ(ierr);
  ierr = PetscLayoutSetBlockSize(B->cmap,1);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent(B,b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent(B,b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE"
PetscErrorCode  MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (right) {
    ierr = VecCreate(((PetscObject)mat)->comm,right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(((PetscObject)mat)->comm,left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr);
    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence differs from the CPU version. In
     particular, the diagonal-block multiplication kernel is launched
     in one stream. Then, in a separate stream, the DeviceToHost data
     transfers (with MPI messaging in between) and the HostToDevice
     transfers are launched. Once the data-transfer stream is
     synchronized, to ensure the messaging is complete, the MatMultAdd
     kernel for the off-diagonal block is launched in the original
     (MatMult) stream to protect against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* As in MatMult_MPIAIJCUSPARSE, this multiplication sequence differs
     from the CPU version. The diagonal-block transpose-multiplication
     kernel is launched in one stream. Then, in a separate stream, the
     DeviceToHost data transfers (with MPI messaging in between) and the
     HostToDevice transfers are launched. Once the data-transfer stream
     is synchronized, to ensure the messaging is complete, the
     MatMultTransposeAdd kernel for the off-diagonal block is launched
     in the original stream to protect against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtransposeadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* PetscErrorCode MatSetValuesBatch_MPIAIJCUSPARSE(Mat J, PetscInt Ne, PetscInt Nl, PetscInt *elemRows, const PetscScalar *elemMats); */

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
EXTERN_C_END


#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A)
{
  MatCUSPARSEStorageFormat format = MAT_CUSPARSE_CSR; /* default; matches the format set in MatCreate_MPIAIJCUSPARSE */
  PetscErrorCode           ierr;
  PetscBool                flg;

  PetscFunctionBegin;
  ierr = PetscObjectOptionsBegin((PetscObject)A);CHKERRQ(ierr);
  ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)format,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsEnd();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  try {
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s",ex);
  }
  cusparseStruct = 0;
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

EXTERN_C_BEGIN
#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
PetscErrorCode  MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatMPIAIJSetPreallocation_C",
                                           "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE",
                                           MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;

  A->ops->getvecs        = MatGetVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunctionDynamic((PetscObject)A,"MatCUSPARSESetFormat_C","MatCUSPARSESetFormat_MPIAIJCUSPARSE",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
EXTERN_C_END

/*@C
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix is ultimately pushed down
   to NVIDIA GPUs and uses the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz, o_nz (or the arrays d_nnz, o_nnz).  By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50. This type is only available when using the 'txpetscgpu'
   package. Use --download-txpetscgpu to build/install PETSc to use different
   CUSPARSE base matrix types.

   Collective on MPI_Comm

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros per row in the diagonal block
           (possibly different for each row) or PETSC_NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row in the off-diagonal block
           (possibly different for each row) or PETSC_NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
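
   For instance, a minimal sketch of that paradigm (the communicator, global sizes M and N,
   and the nonzero counts below are illustrative placeholders, not requirements):
.vb
   Mat            A;
   PetscErrorCode ierr;
   ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
   ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,M,N);CHKERRQ(ierr);
   ierr = MatSetType(A,MATAIJCUSPARSE);CHKERRQ(ierr);
   /* call both preallocation routines; only the one matching the communicator size takes effect */
   ierr = MatSeqAIJSetPreallocation(A,5,PETSC_NULL);CHKERRQ(ierr);
   ierr = MatMPIAIJSetPreallocation(A,5,PETSC_NULL,2,PETSC_NULL);CHKERRQ(ierr);
.ve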

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=PETSC_NULL for PETSc to control dynamic
   memory allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance; see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in CSR format.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library. Use of the
   CUSPARSE library REQUIRES the 'txpetscgpu' package. ELL and HYB formats are also available
   in the txpetscgpu package. Use --download-txpetscgpu to build/install PETSc to use different
   GPU storage formats with CUSPARSE matrix types.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity, as in the sketch below.
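
   A minimal sketch of that recommendation, given a Mat A that has already been created
   and sized (the nonzero counts are illustrative placeholders):
.vb
   ierr = MatSetType(A,MATAIJCUSPARSE);CHKERRQ(ierr);
   /* only the call matching the communicator size takes effect; the other is ignored */
   ierr = MatSeqAIJSetPreallocation(A,5,PETSC_NULL);CHKERRQ(ierr);
   ierr = MatMPIAIJSetPreallocation(A,5,PETSC_NULL,2,PETSC_NULL);CHKERRQ(ierr);
.ve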

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr (ell (ellpack) or hyb (hybrid)) - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions()
.  -mat_cusparse_mult_diag_storage_format csr (ell (ellpack) or hyb (hybrid)) - sets the storage format of the diagonal matrix during a call to MatSetFromOptions()
-  -mat_cusparse_mult_offdiag_storage_format csr (ell (ellpack) or hyb (hybrid)) - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions()

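   For example, a minimal sketch of selecting the HYB storage format at run time for a Mat A
   (these options are more commonly passed on the command line; the PetscOptionsSetValue calls
   below are just one illustrative way to supply them from code):
.vb
   /* equivalent to running with: -mat_type mpiaijcusparse -mat_cusparse_storage_format hyb */
   ierr = PetscOptionsSetValue("-mat_type","mpiaijcusparse");CHKERRQ(ierr);
   ierr = PetscOptionsSetValue("-mat_cusparse_storage_format","hyb");CHKERRQ(ierr);
   ierr = MatSetFromOptions(A);CHKERRQ(ierr);
.ve
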
  Level: beginner

.seealso: MatCreateMPIAIJ(), MATSEQAIJ, MATMPIAIJ, MATMPIAIJCUSPARSE, MATSEQAIJCUSPARSE, MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/