xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 8c34d3f5866e31b8a82117fb2eb7c9541631c9b0)
10fd2b57fSKarl Rupp #define PETSC_SKIP_COMPLEX
20fd2b57fSKarl Rupp 
33d13b8fdSMatthew G. Knepley #include <petscconf.h>
42327d61dSSatish Balay PETSC_CUDA_EXTERN_C_BEGIN
59ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
62327d61dSSatish Balay PETSC_CUDA_EXTERN_C_END
73d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
89ae82921SPaul Mullowney 
9b06137fdSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
/* Preallocation for the MPIAIJCUSPARSE type: validates the requested nonzero
   counts, builds the two sequential GPU blocks (diagonal and off-diagonal) as
   MATSEQAIJCUSPARSE matrices, preallocates them, and pushes the stored GPU
   storage formats, cusparse handle and stream down to both blocks. */
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscErrorCode     ierr;
  PetscInt           row;

  PetscFunctionBegin;
  /* Default per-row nonzero estimates when the caller did not supply any */
  if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
  if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
  if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
  if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);

  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* Per-row counts, if provided, must all be nonnegative */
  if (d_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (d_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",row,d_nnz[row]);
    }
  }
  if (o_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (o_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",row,o_nnz[row]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr);
    ierr = MatSetSizes(mpiaij->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(mpiaij->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)mpiaij->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr);
    ierr = MatSetSizes(mpiaij->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(mpiaij->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)mpiaij->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* Propagate the parallel matrix's GPU settings to both sequential blocks */
  ierr = MatCUSPARSESetFormat(mpiaij->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(mpiaij->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(mpiaij->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(mpiaij->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(mpiaij->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(mpiaij->B,cusp->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
60e057df02SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatCreateVecs_MPIAIJCUSPARSE"
/* Creates GPU (VECCUSP) work vectors compatible with this matrix: *right is
   compatible with the column layout (input of MatMult), *left with the row
   layout (output of MatMult). Either output pointer may be NULL. */
PetscErrorCode  MatCreateVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
{
  PetscErrorCode ierr;
  PetscInt       rb,cb;

  PetscFunctionBegin;
  ierr = MatGetBlockSizes(mat,&rb,&cb);CHKERRQ(ierr);
  if (right) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*right,cb);CHKERRQ(ierr);
    ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr);
    /* Share the matrix column layout so the vector partitioning matches exactly */
    ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
  }
  if (left) {
    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = VecSetBlockSize(*left,rb);CHKERRQ(ierr);
    ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr);
    ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
889ae82921SPaul Mullowney 
899ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
/* yy = A*xx for the MPIAIJ GPU format.

   The sequence differs from the CPU version: the diagonal-block SpMV kernel
   is launched first, while (conceptually in a separate stream) the ghost
   values of xx are scattered DeviceToHost, exchanged over MPI, and copied
   HostToDevice; only after the scatter completes is the off-diagonal
   MatMultAdd kernel launched, which protects against race conditions.
   This sequence should only be called for GPU computation. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nx;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nx);CHKERRQ(ierr);
  if (nx != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nx);
  ierr = VecScatterInitializeForGPU(mpiaij->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
  /* Local (diagonal-block) product overlaps with the ghost-value exchange below */
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* Add the off-diagonal contribution computed from the received ghost values */
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mpiaij->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
120ca45077fSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
/* yy = A^T * xx for the MPIAIJ GPU format.

   For the transpose product the off-diagonal block contributes to OTHER
   processes' rows: B^T*xx is computed into the ghost vector lvec and those
   values are then reverse-scattered and ADDED into yy, overlapping with the
   local diagonal-block transpose product A^T*xx.

   Fixes relative to the previous version:
   - __FUNCT__ previously named the forward-mult routine;
   - xx is an input of A^T, so its local length must match A->rmap->n
     (the previous check used A->cmap->n);
   - the previous body reused the FORWARD multiply sequence (forward scatter
     of xx into lvec, then multtransposeadd on lvec), which is dimensionally
     inconsistent for a transpose product; the reverse-scatter/ADD sequence
     below matches MatMultTranspose_MPIAIJ.

   This sequence should only be called for GPU computation. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);CHKERRQ(ierr);
  /* Ghost contributions B^T*xx, to be accumulated into remote rows of yy */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  /* Local contribution; overlaps with the reverse scatter below */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1519ae82921SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
/* Records the cusparse storage format (CSR/ELL/HYB) to use for the diagonal
   block, the off-diagonal block, or both; the formats are applied to the
   sequential blocks during preallocation.

   Fix: the unsupported-operation error message previously listed
   MAT_CUSPARSE_MULT_DIAG twice and a nonexistent MAT_CUSPARSE_MULT_ALL;
   it now names the three enum values this switch actually accepts. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
176e057df02SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
/* Processes the -mat_cusparse_* storage-format options for the diagonal and
   off-diagonal GPU blocks.

   Fix: this routine is invoked from the generic MatSetFromOptions(), which
   has already opened the options context and passes it in as
   PetscOptionsObject; the previous body additionally called
   PetscObjectOptionsBegin (nested begin, and without CHKERRQ) and closed
   with PetscOptionsEnd while leaving PetscOptionsHead unpaired. The
   standard pattern for a sub-handler is PetscOptionsHead/PetscOptionsTail,
   used below. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptions *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    /* -mat_cusparse_storage_format sets both blocks at once */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
210e057df02SPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
/* Releases the CUSPARSE-specific resources (shared handle, stream, and the
   Mat_MPIAIJCUSPARSE struct) and then defers to the base MPIAIJ destructor. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij  = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    /* Detach the shared cusparse handle from both sequential blocks before destroying it */
    ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusp->handle);CHKERRCUSP(stat);
    err  = cudaStreamDestroy(cusp->stream);CHKERRCUSP(err);
    delete cusp;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  cusp = 0;

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
236ca45077fSPaul Mullowney 
#undef __FUNCT__
#define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
/* Type constructor: builds on MatCreate_MPIAIJ, attaches the
   Mat_MPIAIJCUSPARSE struct (storage formats, cusparse handle, CUDA stream)
   and installs the GPU-specific operations. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij;
  Mat_MPIAIJCUSPARSE *cusp;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  aij       = (Mat_MPIAIJ*)A->data;
  aij->spptr = new Mat_MPIAIJCUSPARSE;

  cusp                      = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusp->diagGPUMatFormat    = MAT_CUSPARSE_CSR;   /* CSR is the default for both blocks */
  cusp->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusp->handle));CHKERRCUSP(stat);
  err  = cudaStreamCreate(&(cusp->stream));CHKERRCUSP(err);

  A->ops->getvecs        = MatCreateVecs_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",  MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2699ae82921SPaul Mullowney 
2703f9c0db1SPaul Mullowney /*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz (or the array d_nnz) and o_nz (or the array o_nnz).  By
   setting these parameters accurately, performance during matrix assembly can be
   increased by more than a factor of 50.
2779ae82921SPaul Mullowney 
278e057df02SPaul Mullowney    Collective on MPI_Comm
279e057df02SPaul Mullowney 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros per row in the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row in the off-diagonal block
           (possibly different for each row) or NULL
287e057df02SPaul Mullowney 
288e057df02SPaul Mullowney    Output Parameter:
289e057df02SPaul Mullowney .  A - the matrix
290e057df02SPaul Mullowney 
291e057df02SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
292e057df02SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
293e057df02SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
294e057df02SPaul Mullowney 
295e057df02SPaul Mullowney    Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
297e057df02SPaul Mullowney 
298e057df02SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
299e057df02SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
300e057df02SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
301e057df02SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
302e057df02SPaul Mullowney 
   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.
307e057df02SPaul Mullowney 
308e057df02SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
309e057df02SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
310e057df02SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
311e057df02SPaul Mullowney    reusing matrix information to achieve increased efficiency.
312e057df02SPaul Mullowney 
313e057df02SPaul Mullowney    Level: intermediate
314e057df02SPaul Mullowney 
315e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
316e057df02SPaul Mullowney @*/
#undef __FUNCT__
#define __FUNCT__ "MatCreateAIJCUSPARSE"
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    nproc;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&nproc);CHKERRQ(ierr);
  /* Pick the sequential or parallel CUSPARSE type depending on the communicator size */
  if (nproc > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3379ae82921SPaul Mullowney 
3383f9c0db1SPaul Mullowney /*M
339e057df02SPaul Mullowney    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
340e057df02SPaul Mullowney 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3422692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3432692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3449ae82921SPaul Mullowney 
3459ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
3469ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
3479ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
3489ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
3499ae82921SPaul Mullowney    the above preallocation routines for simplicity.
3509ae82921SPaul Mullowney 
3519ae82921SPaul Mullowney    Options Database Keys:
352e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
3538468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3548468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3558468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3569ae82921SPaul Mullowney 
3579ae82921SPaul Mullowney   Level: beginner
3589ae82921SPaul Mullowney 
3598468deeeSKarl Rupp  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3608468deeeSKarl Rupp M
3619ae82921SPaul Mullowney M*/
362