/* xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 53800007fd6c7b4835acc3912dc4427fee16917e) */
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatPinToCPU(b->A,B->pinnedtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatPinToCPU(b->B,B->pinnedtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /*
     This multiplication sequence is different from the CPU version. In particular,
     the diagonal block multiplication kernel is launched in one stream. Then, in a
     separate stream, the DeviceToHost data transfers (with MPI messaging in between)
     and the HostToDevice transfers are launched. Once the data transfer stream is
     synchronized, to ensure the messaging is complete, the MatMultAdd kernel for the
     off-diagonal block is launched in the original (MatMult) stream to protect
     against race conditions.
  */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  /*
     This multiplication sequence is different from the CPU version. In particular,
     the diagonal block multiplication kernel is launched in one stream. Then, in a
     separate stream, the DeviceToHost data transfers (with MPI messaging in between)
     and the HostToDevice transfers are launched. Once the data transfer stream is
     synchronized, to ensure the messaging is complete, the MatMultAdd kernel for the
     off-diagonal block is launched in the original (MatMult) stream to protect
     against race conditions.
  */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence is different from the CPU version. The off-diagonal
     block transpose multiplication computes the contributions destined for other
     processes into lvec, while the diagonal block transpose multiplication computes
     the local part of the result into yy. A reverse scatter with ADD_VALUES (which
     performs the DeviceToHost transfers, the MPI messaging, and the HostToDevice
     transfers) then accumulates the remote contributions into yy.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

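/* A minimal usage sketch (illustrative only, not called anywhere in this file): switch an
   assembled aijcusparse matrix A to the HYB storage format for both the diagonal and
   off-diagonal blocks.

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_HYB);CHKERRQ(ierr);
*/
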
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *mpiaij;

  PetscFunctionBegin;
  mpiaij = (Mat_MPIAIJ*)A->data;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    err  = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat);
  err  = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err);

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). The matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE)
.  n - number of local columns (or PETSC_DECIDE)
.  M - number of global rows (or PETSC_DETERMINE)
.  N - number of global columns (or PETSC_DETERMINE)
.  d_nz - number of nonzeros per row in the diagonal portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal portion of the local submatrix (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal portion of the local submatrix (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT, o_nz=PETSC_DEFAULT and d_nnz=NULL, o_nnz=NULL for PETSc to
   control dynamic memory allocation.  For large problems you MUST preallocate memory
   or you will get TERRIBLE performance, see the users' manual chapter on matrices.
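
   For example, a minimal sketch (the local sizes and nonzero counts below are
   illustrative, not prescriptive): create a square matrix with 100 rows and columns
   per process, preallocating at most 5 nonzeros per row in the diagonal block and
   at most 2 per row in the off-diagonal block; values would then be inserted with
   MatSetValues() before assembly.

.vb
     Mat A;
     ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,100,100,PETSC_DETERMINE,PETSC_DETERMINE,5,NULL,2,NULL,&A);CHKERRQ(ierr);
     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatDestroy(&A);CHKERRQ(ierr);
.ve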

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity, as shown in the sketch below.
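
   A minimal sketch of that sequence (the preallocation counts 5 and 2 are illustrative
   assumptions; comm, m, n, M, N stand for whatever sizes the application uses):

.vb
     ierr = MatCreate(comm,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,m,n,M,N);CHKERRQ(ierr);
     ierr = MatSetType(A,MATAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatSeqAIJSetPreallocation(A,5,NULL);CHKERRQ(ierr);
     ierr = MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);CHKERRQ(ierr);
.ve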

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

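   For example, an application that calls MatSetFromOptions() can select this type and
   the HYB storage format at run time (the executable name and process count here are
   placeholders):

.vb
     mpiexec -n 2 ./myapp -mat_type mpiaijcusparse -mat_cusparse_storage_format hyb
.ve
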
  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/