xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 33d57670fcdbf57d9203d482728f549b81403a0e)
13d13b8fdSMatthew G. Knepley #include <petscconf.h>
22327d61dSSatish Balay PETSC_CUDA_EXTERN_C_BEGIN
39ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
42327d61dSSatish Balay PETSC_CUDA_EXTERN_C_END
53d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
69ae82921SPaul Mullowney 
7b06137fdSPaul Mullowney 
89ae82921SPaul Mullowney #undef __FUNCT__
99ae82921SPaul Mullowney #define __FUNCT__ "MatMPIAIJSetPreallocation_MPIAIJCUSPARSE"
109ae82921SPaul Mullowney PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
119ae82921SPaul Mullowney {
12bbf3fe20SPaul Mullowney   Mat_MPIAIJ         *b               = (Mat_MPIAIJ*)B->data;
13bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
149ae82921SPaul Mullowney   PetscErrorCode     ierr;
159ae82921SPaul Mullowney   PetscInt           i;
169ae82921SPaul Mullowney 
179ae82921SPaul Mullowney   PetscFunctionBegin;
189ae82921SPaul Mullowney   if (d_nz == PETSC_DEFAULT || d_nz == PETSC_DECIDE) d_nz = 5;
199ae82921SPaul Mullowney   if (o_nz == PETSC_DEFAULT || o_nz == PETSC_DECIDE) o_nz = 2;
209ae82921SPaul Mullowney   if (d_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nz cannot be less than 0: value %D",d_nz);
219ae82921SPaul Mullowney   if (o_nz < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nz cannot be less than 0: value %D",o_nz);
229ae82921SPaul Mullowney 
239ae82921SPaul Mullowney   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
249ae82921SPaul Mullowney   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
259ae82921SPaul Mullowney   if (d_nnz) {
269ae82921SPaul Mullowney     for (i=0; i<B->rmap->n; i++) {
279ae82921SPaul Mullowney       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
289ae82921SPaul Mullowney     }
299ae82921SPaul Mullowney   }
309ae82921SPaul Mullowney   if (o_nnz) {
319ae82921SPaul Mullowney     for (i=0; i<B->rmap->n; i++) {
329ae82921SPaul Mullowney       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
339ae82921SPaul Mullowney     }
349ae82921SPaul Mullowney   }
359ae82921SPaul Mullowney   if (!B->preallocated) {
36bbf3fe20SPaul Mullowney     /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
379ae82921SPaul Mullowney     ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
389ae82921SPaul Mullowney     ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
399ae82921SPaul Mullowney     ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
403bb1ff40SBarry Smith     ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
419ae82921SPaul Mullowney     ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
429ae82921SPaul Mullowney     ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
439ae82921SPaul Mullowney     ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
443bb1ff40SBarry Smith     ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
459ae82921SPaul Mullowney   }
469ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
479ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
48e057df02SPaul Mullowney   ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
49e057df02SPaul Mullowney   ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
50b06137fdSPaul Mullowney   ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
51b06137fdSPaul Mullowney   ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
52b06137fdSPaul Mullowney   ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
53b06137fdSPaul Mullowney   ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
542205254eSKarl Rupp 
559ae82921SPaul Mullowney   B->preallocated = PETSC_TRUE;
569ae82921SPaul Mullowney   PetscFunctionReturn(0);
579ae82921SPaul Mullowney }
58e057df02SPaul Mullowney 
599ae82921SPaul Mullowney #undef __FUNCT__
609ae82921SPaul Mullowney #define __FUNCT__ "MatGetVecs_MPIAIJCUSPARSE"
619ae82921SPaul Mullowney PetscErrorCode  MatGetVecs_MPIAIJCUSPARSE(Mat mat,Vec *right,Vec *left)
629ae82921SPaul Mullowney {
639ae82921SPaul Mullowney   PetscErrorCode ierr;
64*33d57670SJed Brown   PetscInt rbs,cbs;
659ae82921SPaul Mullowney 
669ae82921SPaul Mullowney   PetscFunctionBegin;
67*33d57670SJed Brown   ierr = MatGetBlockSizes(mat,&rbs,&cbs);CHKERRQ(ierr);
689ae82921SPaul Mullowney   if (right) {
69ce94432eSBarry Smith     ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
709ae82921SPaul Mullowney     ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
71*33d57670SJed Brown     ierr = VecSetBlockSize(*right,cbs);CHKERRQ(ierr);
729ae82921SPaul Mullowney     ierr = VecSetType(*right,VECCUSP);CHKERRQ(ierr);
73cbf1f8acSSatish Balay     ierr = VecSetLayout(*right,mat->cmap);CHKERRQ(ierr);
749ae82921SPaul Mullowney   }
759ae82921SPaul Mullowney   if (left) {
76ce94432eSBarry Smith     ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
779ae82921SPaul Mullowney     ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
78*33d57670SJed Brown     ierr = VecSetBlockSize(*left,rbs);CHKERRQ(ierr);
799ae82921SPaul Mullowney     ierr = VecSetType(*left,VECCUSP);CHKERRQ(ierr);
80cbf1f8acSSatish Balay     ierr = VecSetLayout(*left,mat->rmap);CHKERRQ(ierr);
839ae82921SPaul Mullowney   }
849ae82921SPaul Mullowney   PetscFunctionReturn(0);
859ae82921SPaul Mullowney }
869ae82921SPaul Mullowney 
879ae82921SPaul Mullowney 
889ae82921SPaul Mullowney #undef __FUNCT__
899ae82921SPaul Mullowney #define __FUNCT__ "MatMult_MPIAIJCUSPARSE"
909ae82921SPaul Mullowney PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
919ae82921SPaul Mullowney {
92e057df02SPaul Mullowney   /* This multiplication sequence differs from the one used on
93e057df02SPaul Mullowney      the CPU. In particular, the diagonal block
94e057df02SPaul Mullowney      multiplication kernel is launched in one stream. Then,
95e057df02SPaul Mullowney      in a separate stream, the data transfers from DeviceToHost
96e057df02SPaul Mullowney      (with MPI messaging in between), then HostToDevice are
97e057df02SPaul Mullowney      launched. Once the data transfer stream is synchronized,
98e057df02SPaul Mullowney      to ensure messaging is complete, the MatMultAdd kernel
99e057df02SPaul Mullowney      is launched in the original (MatMult) stream to protect
100e057df02SPaul Mullowney      against race conditions.
101e057df02SPaul Mullowney 
102e057df02SPaul Mullowney      This sequence should only be called for GPU computation. */
1039ae82921SPaul Mullowney   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
1049ae82921SPaul Mullowney   PetscErrorCode ierr;
1059ae82921SPaul Mullowney   PetscInt       nt;
1069ae82921SPaul Mullowney 
1079ae82921SPaul Mullowney   PetscFunctionBegin;
1089ae82921SPaul Mullowney   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
1099ae82921SPaul Mullowney   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
1109ae82921SPaul Mullowney   ierr = VecScatterInitializeForGPU(a->Mvctx,xx,SCATTER_FORWARD);CHKERRQ(ierr);
1119ae82921SPaul Mullowney   ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
1129ae82921SPaul Mullowney   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1139ae82921SPaul Mullowney   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1149ae82921SPaul Mullowney   ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
1159ae82921SPaul Mullowney   ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
1169ae82921SPaul Mullowney   PetscFunctionReturn(0);
1179ae82921SPaul Mullowney }
118ca45077fSPaul Mullowney 
119ca45077fSPaul Mullowney #undef __FUNCT__
120ca45077fSPaul Mullowney #define __FUNCT__ "MatMultTranspose_MPIAIJCUSPARSE"
121ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
122ca45077fSPaul Mullowney {
123e057df02SPaul Mullowney   /* This multiplication sequence differs from the one used on
124e057df02SPaul Mullowney      the CPU only in how the vector scatter is handled on the GPU.
125e057df02SPaul Mullowney      The transpose multiplications by the off-diagonal and diagonal
126e057df02SPaul Mullowney      blocks are launched on the device. The off-process contributions,
127e057df02SPaul Mullowney      accumulated in lvec by the off-diagonal block, are then added
128e057df02SPaul Mullowney      into yy by a reverse scatter: data is transferred DeviceToHost,
129e057df02SPaul Mullowney      the MPI messaging is performed, and the result is transferred
130e057df02SPaul Mullowney      back HostToDevice. The scatter is completed before yy is used,
131e057df02SPaul Mullowney      to protect against race conditions.
132e057df02SPaul Mullowney 
133e057df02SPaul Mullowney      This sequence should only be called for GPU computation. */
134ca45077fSPaul Mullowney   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
135ca45077fSPaul Mullowney   PetscErrorCode ierr;
136ca45077fSPaul Mullowney   PetscInt       nt;
137ca45077fSPaul Mullowney 
138ca45077fSPaul Mullowney   PetscFunctionBegin;
139ca45077fSPaul Mullowney   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
140ca45077fSPaul Mullowney   if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
141ca45077fSPaul Mullowney   ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec,SCATTER_REVERSE);CHKERRQ(ierr);
142ca45077fSPaul Mullowney   ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
143ca45077fSPaul Mullowney   ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
144ca45077fSPaul Mullowney   ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
145ca45077fSPaul Mullowney   ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
146ca45077fSPaul Mullowney   ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
147ca45077fSPaul Mullowney   PetscFunctionReturn(0);
148ca45077fSPaul Mullowney }
1499ae82921SPaul Mullowney 
150ca45077fSPaul Mullowney #undef __FUNCT__
151e057df02SPaul Mullowney #define __FUNCT__ "MatCUSPARSESetFormat_MPIAIJCUSPARSE"
152e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
153ca45077fSPaul Mullowney {
154ca45077fSPaul Mullowney   Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
155bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
156e057df02SPaul Mullowney 
157ca45077fSPaul Mullowney   PetscFunctionBegin;
158e057df02SPaul Mullowney   switch (op) {
159e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT_DIAG:
160e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat = format;
161045c96e1SPaul Mullowney     break;
162e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT_OFFDIAG:
163e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = format;
164045c96e1SPaul Mullowney     break;
165e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
166e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat    = format;
167e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = format;
168045c96e1SPaul Mullowney     break;
169e057df02SPaul Mullowney   default:
170e057df02SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
171045c96e1SPaul Mullowney   }
172ca45077fSPaul Mullowney   PetscFunctionReturn(0);
173ca45077fSPaul Mullowney }
174e057df02SPaul Mullowney 
175e057df02SPaul Mullowney #undef __FUNCT__
176e057df02SPaul Mullowney #define __FUNCT__ "MatSetFromOptions_MPIAIJCUSPARSE"
177e057df02SPaul Mullowney PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A)
178e057df02SPaul Mullowney {
179e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
180e057df02SPaul Mullowney   PetscErrorCode           ierr;
181e057df02SPaul Mullowney   PetscBool                flg;
1825fd66863SKarl Rupp 
183e057df02SPaul Mullowney   PetscFunctionBegin;
184e057df02SPaul Mullowney   ierr = PetscOptionsHead("MPIAIJCUSPARSE options");CHKERRQ(ierr);
185e057df02SPaul Mullowney   ierr = PetscObjectOptionsBegin((PetscObject)A);
186e057df02SPaul Mullowney   if (A->factortype==MAT_FACTOR_NONE) {
187e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
1887083f85cSSatish Balay                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
189e057df02SPaul Mullowney     if (flg) {
190e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
191e057df02SPaul Mullowney     }
192e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
1937083f85cSSatish Balay                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
194e057df02SPaul Mullowney     if (flg) {
195e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
196e057df02SPaul Mullowney     }
197e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
1987083f85cSSatish Balay                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)MAT_CUSPARSE_CSR,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
199e057df02SPaul Mullowney     if (flg) {
200e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
201e057df02SPaul Mullowney     }
202e057df02SPaul Mullowney   }
203e057df02SPaul Mullowney   ierr = PetscOptionsEnd();CHKERRQ(ierr);
204e057df02SPaul Mullowney   PetscFunctionReturn(0);
205e057df02SPaul Mullowney }
206e057df02SPaul Mullowney 
207bbf3fe20SPaul Mullowney #undef __FUNCT__
208bbf3fe20SPaul Mullowney #define __FUNCT__ "MatDestroy_MPIAIJCUSPARSE"
209bbf3fe20SPaul Mullowney PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
210bbf3fe20SPaul Mullowney {
211bbf3fe20SPaul Mullowney   PetscErrorCode     ierr;
212bbf3fe20SPaul Mullowney   Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
213bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
214b06137fdSPaul Mullowney   cudaError_t        err;
215b06137fdSPaul Mullowney   cusparseStatus_t   stat;
216bbf3fe20SPaul Mullowney 
217bbf3fe20SPaul Mullowney   PetscFunctionBegin;
218bbf3fe20SPaul Mullowney   try {
219b06137fdSPaul Mullowney     ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
220b06137fdSPaul Mullowney     ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
221b06137fdSPaul Mullowney     stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSP(stat);
222b06137fdSPaul Mullowney     err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUSP(err);
223bbf3fe20SPaul Mullowney     delete cusparseStruct;
224bbf3fe20SPaul Mullowney   } catch(char *ex) {
225bbf3fe20SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
226bbf3fe20SPaul Mullowney   }
227bbf3fe20SPaul Mullowney   cusparseStruct = 0;
2282205254eSKarl Rupp 
229bbf3fe20SPaul Mullowney   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
230bbf3fe20SPaul Mullowney   PetscFunctionReturn(0);
231bbf3fe20SPaul Mullowney }
232ca45077fSPaul Mullowney 
2339ae82921SPaul Mullowney #undef __FUNCT__
2349ae82921SPaul Mullowney #define __FUNCT__ "MatCreate_MPIAIJCUSPARSE"
2358cc058d9SJed Brown PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
2369ae82921SPaul Mullowney {
2379ae82921SPaul Mullowney   PetscErrorCode     ierr;
238bbf3fe20SPaul Mullowney   Mat_MPIAIJ         *a;
239bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE * cusparseStruct;
240b06137fdSPaul Mullowney   cudaError_t        err;
241b06137fdSPaul Mullowney   cusparseStatus_t   stat;
2429ae82921SPaul Mullowney 
2439ae82921SPaul Mullowney   PetscFunctionBegin;
244bbf3fe20SPaul Mullowney   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
245bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
246bbf3fe20SPaul Mullowney   a        = (Mat_MPIAIJ*)A->data;
247bbf3fe20SPaul Mullowney   a->spptr = new Mat_MPIAIJCUSPARSE;
2482205254eSKarl Rupp 
249bbf3fe20SPaul Mullowney   cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
250e057df02SPaul Mullowney   cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
251e057df02SPaul Mullowney   cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
252b06137fdSPaul Mullowney   stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSP(stat);
253b06137fdSPaul Mullowney   err = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUSP(err);
2542205254eSKarl Rupp 
255bbf3fe20SPaul Mullowney   A->ops->getvecs        = MatGetVecs_MPIAIJCUSPARSE;
256bbf3fe20SPaul Mullowney   A->ops->mult           = MatMult_MPIAIJCUSPARSE;
257bbf3fe20SPaul Mullowney   A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
258bbf3fe20SPaul Mullowney   A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
259bbf3fe20SPaul Mullowney   A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
2602205254eSKarl Rupp 
261bbf3fe20SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
262bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",  MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
2639ae82921SPaul Mullowney   PetscFunctionReturn(0);
2649ae82921SPaul Mullowney }
2659ae82921SPaul Mullowney 
2663f9c0db1SPaul Mullowney /*@
2673f9c0db1SPaul Mullowney    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
268e057df02SPaul Mullowney    (the default parallel PETSc format).  This matrix will ultimately be pushed down
2693f9c0db1SPaul Mullowney    to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
270e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
271e057df02SPaul Mullowney    the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these parameters accurately,
272e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
2739ae82921SPaul Mullowney 
274e057df02SPaul Mullowney    Collective on MPI_Comm
275e057df02SPaul Mullowney 
276e057df02SPaul Mullowney    Input Parameters:
277e057df02SPaul Mullowney +  comm - MPI communicator
278e057df02SPaul Mullowney .  m,n - number of local rows and columns (or PETSC_DECIDE to have them determined from M and N)
279e057df02SPaul Mullowney .  M,N - number of global rows and columns (or PETSC_DETERMINE to have them determined from m and n)
280e057df02SPaul Mullowney .  d_nz,o_nz - number of nonzeros per row in the diagonal and off-diagonal portions of the local submatrix (same value used for all local rows)
281e057df02SPaul Mullowney -  d_nnz,o_nnz - arrays containing the number of nonzeros in the various rows of the diagonal and off-diagonal portions of the local submatrix
2820298fd71SBarry Smith          (possibly different for each row) or NULL
283e057df02SPaul Mullowney 
284e057df02SPaul Mullowney    Output Parameter:
285e057df02SPaul Mullowney .  A - the matrix
286e057df02SPaul Mullowney 
287e057df02SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
288e057df02SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
289e057df02SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
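
   For example, a construction sequence following that paradigm might look like the
   sketch below (error checking with CHKERRQ() is omitted and the size and
   preallocation arguments are placeholders):

      MatCreate(comm,&A);
      MatSetSizes(A,m,n,M,N);
      MatSetType(A,MATMPIAIJCUSPARSE);
      MatMPIAIJSetPreallocation(A,d_nz,d_nnz,o_nz,o_nnz);

   followed by the usual MatSetValues(), MatAssemblyBegin(), and MatAssemblyEnd() calls.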
290e057df02SPaul Mullowney 
291e057df02SPaul Mullowney    Notes:
292e057df02SPaul Mullowney    If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
293e057df02SPaul Mullowney 
294e057df02SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
295e057df02SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
296e057df02SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
297e057df02SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
298e057df02SPaul Mullowney 
299e057df02SPaul Mullowney    Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
3000298fd71SBarry Smith    Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
301e057df02SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
302e057df02SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
303e057df02SPaul Mullowney 
304e057df02SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
305e057df02SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
306e057df02SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
307e057df02SPaul Mullowney    reusing matrix information to achieve increased efficiency.
308e057df02SPaul Mullowney 
309e057df02SPaul Mullowney    Level: intermediate
310e057df02SPaul Mullowney 
311e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
312e057df02SPaul Mullowney @*/
3139ae82921SPaul Mullowney #undef __FUNCT__
3149ae82921SPaul Mullowney #define __FUNCT__ "MatCreateAIJCUSPARSE"
3159ae82921SPaul Mullowney PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
3169ae82921SPaul Mullowney {
3179ae82921SPaul Mullowney   PetscErrorCode ierr;
3189ae82921SPaul Mullowney   PetscMPIInt    size;
3199ae82921SPaul Mullowney 
3209ae82921SPaul Mullowney   PetscFunctionBegin;
3219ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
3229ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
3239ae82921SPaul Mullowney   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
3249ae82921SPaul Mullowney   if (size > 1) {
3259ae82921SPaul Mullowney     ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
3269ae82921SPaul Mullowney     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
3279ae82921SPaul Mullowney   } else {
3289ae82921SPaul Mullowney     ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3299ae82921SPaul Mullowney     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
3309ae82921SPaul Mullowney   }
3319ae82921SPaul Mullowney   PetscFunctionReturn(0);
3329ae82921SPaul Mullowney }
3339ae82921SPaul Mullowney 
3343f9c0db1SPaul Mullowney /*M
335e057df02SPaul Mullowney    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
336e057df02SPaul Mullowney 
3372692e278SPaul Mullowney    A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
3382692e278SPaul Mullowney    CSR, ELL (ELLPACK), or Hybrid (HYB) format. The ELL and HYB formats require CUDA 4.2 or later.
3392692e278SPaul Mullowney    All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.
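
   The storage format can also be selected programmatically with MatCUSPARSESetFormat().
   For example, the following sketch (assuming the MAT_CUSPARSE_HYB enumerator names the
   hybrid format) requests the hybrid format for both the diagonal and off-diagonal blocks:

      MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_HYB);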
3409ae82921SPaul Mullowney 
3419ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
3429ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
3439ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
3449ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
3459ae82921SPaul Mullowney    the above preallocation routines for simplicity.
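
   For example, code that must work for any communicator size can simply call both,
   as in the sketch below; the preallocation call that does not apply to the
   constructed type is ignored (d_nz, d_nnz, o_nz, and o_nnz are placeholders):

      MatSetType(A,MATAIJCUSPARSE);
      MatMPIAIJSetPreallocation(A,d_nz,d_nnz,o_nz,o_nnz);
      MatSeqAIJSetPreallocation(A,d_nz,d_nnz);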
3469ae82921SPaul Mullowney 
3479ae82921SPaul Mullowney    Options Database Keys:
348e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
3498468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3508468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3518468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
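
   For example, an application might be run with both blocks stored in the hybrid
   format as follows (the executable name is a placeholder):

      ./my_app -mat_type mpiaijcusparse -mat_cusparse_storage_format hyb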
3529ae82921SPaul Mullowney 
3539ae82921SPaul Mullowney   Level: beginner
3549ae82921SPaul Mullowney 
3558468deeeSKarl Rupp  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3579ae82921SPaul Mullowney M*/