xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision d083f849a86f1f43e18d534ee43954e2786cb29a)
1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
20fd2b57fSKarl Rupp 
33d13b8fdSMatthew G. Knepley #include <petscconf.h>
49ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
53d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
69ae82921SPaul Mullowney 
/*
   Preallocation for the parallel AIJCUSPARSE matrix: validates the per-row
   nonzero counts, creates the diagonal/off-diagonal blocks as sequential
   cusparse matrices on first use, and pushes the GPU configuration
   (storage format, cusparse handle, stream) down to both blocks.
*/
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           row;

  PetscFunctionBegin;
  /* The row/column layouts must be set up before B->rmap->n etc. are valid */
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* Reject negative preallocation counts: first the diagonal block ... */
  if (d_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (d_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",row,d_nnz[row]);
    }
  }
  /* ... then the off-diagonal block */
  if (o_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (o_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",row,o_nnz[row]);
    }
  }
  if (!B->preallocated) {
    /* First call: explicitly create the two blocks as MATSEQAIJCUSPARSE so
       both the diagonal (b->A) and off-diagonal (b->B) parts live on the GPU */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* Propagate the parallel matrix's GPU configuration to the blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
50e057df02SPaul Mullowney 
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* GPU-only multiplication sequence, ordered differently from the CPU
     version: the diagonal-block SpMV kernel is launched in one stream
     while, in a separate stream, the DeviceToHost transfer, the MPI
     messaging, and the HostToDevice transfer of the ghost values take
     place.  Once the transfer stream is synchronized — guaranteeing the
     messaging is complete — the off-diagonal MatMultAdd kernel is
     launched back in the original (MatMult) stream so there is no race
     with the ghost data. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nx;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nx);CHKERRQ(ierr);
  if (nx != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nx);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);                                       /* yy  = A_diag * xx */
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);   /* gather ghost values of xx */
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);                            /* yy += A_offdiag * lvec */
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
79ca45077fSPaul Mullowney 
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* GPU-only transpose multiplication.  Both transpose kernels (off-diagonal
     into the ghost vector, diagonal into yy) are launched first; the reverse
     scatter then moves the ghost contributions (DeviceToHost, MPI messaging,
     HostToDevice) and adds them into yy.  The scatter's synchronization
     ensures the messaging has completed before the result is used, so there
     is no race with the kernels.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nx;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nx);CHKERRQ(ierr);
  if (nx != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nx);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);            /* lvec = A_offdiag^T * xx */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);                 /* yy   = A_diag^T * xx */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); /* yy  += scattered lvec */
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1089ae82921SPaul Mullowney 
/*
   Type-specific implementation of MatCUSPARSESetFormat(): records the
   requested cusparse storage format for the diagonal block, the
   off-diagonal block, or both, in the Mat_MPIAIJCUSPARSE bookkeeping
   struct.  Any other operation is an error.
*/
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* Fixed message: it previously listed MAT_CUSPARSE_MULT_DIAG twice and
       named a nonexistent MAT_CUSPARSE_MULT_ALL instead of MAT_CUSPARSE_ALL */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
131e057df02SPaul Mullowney 
/*
   Processes the -mat_cusparse_* options database keys controlling the GPU
   storage format of the diagonal and off-diagonal blocks (unfactored
   matrices only).
*/
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat fmt;
  PetscErrorCode           ierr;
  PetscBool                set;
  Mat_MPIAIJ               *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* Diagonal block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,fmt);CHKERRQ(ierr);
    }
    /* Off-diagonal block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->offdiagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,fmt);CHKERRQ(ierr);
    }
    /* Both blocks at once */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
162e057df02SPaul Mullowney 
/*
   Completes assembly via the base MPIAIJ routine, then — on the first final
   assembly — converts the ghost-value work vector to a CUDA vector so the
   scatter path in MatMult can stay on the GPU.
*/
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    /* lvec is created during the first assembly; retype it to live on the GPU */
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
17634d6c7a5SJose E. Roman 
/*
   Destroys the MPIAIJCUSPARSE matrix: releases the cusparse handles of the
   two sequential blocks, destroys the shared cusparse handle and CUDA
   stream, frees the GPU bookkeeping struct, then delegates to the base
   MPIAIJ destroy routine.
*/
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    err  = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* Null the owning pointer, not just the local copy: the previous code set
     the local cusparseStruct to 0 (a dead store) and left a->spptr dangling
     until MatDestroy_MPIAIJ freed the containing struct. */
  a->spptr = 0;

  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
200ca45077fSPaul Mullowney 
/*
   Constructor for MATMPIAIJCUSPARSE: builds a plain MPIAIJ matrix, attaches
   the GPU bookkeeping (storage formats, cusparse handle, CUDA stream), and
   overrides the operations table with the cusparse implementations.
*/
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE * cusparseStruct;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  /* Start from the CPU parallel AIJ implementation, then specialize it */
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  /* Vectors associated with this matrix default to CUDA vectors */
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  /* Allocate and initialize the GPU-side bookkeeping, then publish it */
  cusparseStruct                      = new Mat_MPIAIJCUSPARSE;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat);
  err  = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err);
  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = cusparseStruct;

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",  MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2349ae82921SPaul Mullowney 
2353f9c0db1SPaul Mullowney /*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these
   parameters accurately, performance during matrix assembly can be increased by more than a factor of 50.
2429ae82921SPaul Mullowney 
243*d083f849SBarry Smith    Collective
244e057df02SPaul Mullowney 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE)
.  n - number of local columns (or PETSC_DECIDE)
.  M - number of global rows (or PETSC_DETERMINE)
.  N - number of global columns (or PETSC_DETERMINE)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
         (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
         (possibly different for each row) or NULL
252e057df02SPaul Mullowney 
253e057df02SPaul Mullowney    Output Parameter:
254e057df02SPaul Mullowney .  A - the matrix
255e057df02SPaul Mullowney 
256e057df02SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
257e057df02SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
258e057df02SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
259e057df02SPaul Mullowney 
   Notes:
   If d_nnz (respectively o_nnz) is given then d_nz (respectively o_nz) is ignored
262e057df02SPaul Mullowney 
263e057df02SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
264e057df02SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
265e057df02SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
266e057df02SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
267e057df02SPaul Mullowney 
   Specify the preallocated storage with either the nz or the nnz parameters (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
270e057df02SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
271e057df02SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
272e057df02SPaul Mullowney 
273e057df02SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
274e057df02SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
275e057df02SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
276e057df02SPaul Mullowney    reusing matrix information to achieve increased efficiency.
277e057df02SPaul Mullowney 
278e057df02SPaul Mullowney    Level: intermediate
279e057df02SPaul Mullowney 
280e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
281e057df02SPaul Mullowney @*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  /* Choose the sequential or parallel cusparse type based on the communicator size */
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size == 1) {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3009ae82921SPaul Mullowney 
3013ca39a21SBarry Smith /*MC
302e057df02SPaul Mullowney    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
303e057df02SPaul Mullowney 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3052692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3062692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3079ae82921SPaul Mullowney 
3089ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
3099ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
3109ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
3119ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
3129ae82921SPaul Mullowney    the above preallocation routines for simplicity.
3139ae82921SPaul Mullowney 
3149ae82921SPaul Mullowney    Options Database Keys:
315e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
3168468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3178468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3188468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3199ae82921SPaul Mullowney 
3209ae82921SPaul Mullowney   Level: beginner
3219ae82921SPaul Mullowney 
.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
325