xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 6b78a27e8a44285ab9cca1e39a4245b45917708b)
1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
253800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
399acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
40fd2b57fSKarl Rupp 
53d13b8fdSMatthew G. Knepley #include <petscconf.h>
69ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
757d48284SJunchao Zhang #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
83d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
99ae82921SPaul Mullowney 
/* Preallocates an MPIAIJCUSPARSE matrix: validates the per-row nonzero counts,
   creates (on first call) the diagonal (b->A) and off-diagonal (b->B) blocks as
   MATSEQAIJCUSPARSE matrices, preallocates them, and propagates the storage
   format, cuSPARSE handle, and stream from the parallel matrix to both blocks.

   Input Parameters:
.  B     - the MPIAIJCUSPARSE matrix being preallocated
.  d_nz  - nonzeros per row of the diagonal block (same for all rows)
.  d_nnz - per-row nonzero counts for the diagonal block, or NULL
.  o_nz  - nonzeros per row of the off-diagonal block (same for all rows)
.  o_nnz - per-row nonzero counts for the off-diagonal block, or NULL */
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b               = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* Reject negative per-row counts before any allocation is done */
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    /* Keep the sub-blocks' CPU-binding state in sync with the parallel matrix */
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    /* Off-diagonal block has the full global column width */
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  /* Set type on every call (the blocks may have been created elsewhere) and
     make sure the local work vector lives on the GPU */
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  if (b->lvec) {
    ierr = VecSetType(b->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* Push the per-block storage formats and the shared cuSPARSE handle/stream
     down into the sequential sub-matrices */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
58e057df02SPaul Mullowney 
/* yy = A*xx for an MPIAIJCUSPARSE matrix: the diagonal-block multiply is issued
   between VecScatterBegin and VecScatterEnd so local computation can overlap
   with the communication that gathers the off-process entries of xx into lvec;
   the off-diagonal block then accumulates into yy. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  /* Start gathering ghost values of xx into the local work vector lvec */
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* yy = Ad*xx while the scatter is in flight */
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* yy += Ao*lvec (off-diagonal contribution) */
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
76ca45077fSPaul Mullowney 
/* Zeros all entries of an MPIAIJCUSPARSE matrix. If a stripped-down device-side
   split-CSR structure exists (unfactored matrices only), its diagonal and
   off-diagonal value arrays are zeroed directly on the GPU with cudaMemset;
   the host-side blocks are then zeroed through the usual MatZeroEntries path. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
      /* nnz counts come from the host CSR row pointers of each block;
         NOTE(review): assumes these match the device arrays' sizes — confirm */
      PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t  err;
      PetscScalar  *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      /* d_mat lives in device memory, so first fetch the device pointer to the
         diagonal values array, then memset the array it points to */
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      /* same for the off-diagonal values array */
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  /* Zero the (host-managed) diagonal and off-diagonal blocks */
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
/* zz = A*xx + yy for an MPIAIJCUSPARSE matrix. Mirrors MatMult_MPIAIJCUSPARSE:
   the diagonal-block multiply-add runs between scatter begin/end so it can
   overlap with communication of the ghost entries of xx. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  /* Start gathering ghost values of xx into lvec */
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* zz = Ad*xx + yy while the scatter is in flight */
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* zz += Ao*lvec (off-diagonal contribution) */
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
121fdc842d1SBarry Smith 
/* yy = A^T*xx for an MPIAIJCUSPARSE matrix. Both local transpose products are
   computed first; the off-diagonal contribution (held in lvec) is then
   scattered in REVERSE with ADD_VALUES so each process accumulates the pieces
   of yy owned by it. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  /* For the transpose, xx must be partitioned like the ROWS of A */
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec);CHKERRQ(ierr);
  /* lvec = Ao^T*xx : contributions destined for other processes */
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  /* yy = Ad^T*xx : the locally-owned part */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  /* Reverse scatter-add folds the off-process contributions into yy */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1399ae82921SPaul Mullowney 
/* Records the requested cuSPARSE storage format for the diagonal block, the
   off-diagonal block, or both. The formats are only stored here; they are
   pushed into the sequential sub-matrices during preallocation.

   Input Parameters:
.  A      - the MPIAIJCUSPARSE matrix
.  op     - which block(s) the format applies to
.  format - the cuSPARSE storage format to use */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* Error message now names the operations actually handled above (the old
       text listed MAT_CUSPARSE_MULT_DIAG twice and a nonexistent MAT_CUSPARSE_MULT_ALL) */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
162e057df02SPaul Mullowney 
/* Processes the -mat_cusparse_* options database keys that select the GPU
   storage format for the diagonal block, the off-diagonal block, or both.
   Only unfactored matrices consult these options. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* Current stored format serves as the default shown to the user */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    /* This one applies the same format to both blocks at once */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
193e057df02SPaul Mullowney 
/* Finishes assembly of an MPIAIJCUSPARSE matrix via the generic MPIAIJ path,
   makes the ghost-value vector a CUDA vector on first assembly, and, when the
   device-side split-CSR structure reports a newer nonzerostate than the host,
   marks the matrix data as resident on the GPU. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
  /* Default to the host nonzerostate; overwritten from the device below when a
     device matrix exists, so the later comparison is a no-op otherwise */
  PetscInt                    nnz_state = A->nonzerostate;
  PetscFunctionBegin;
  if (d_mat) {
    cudaError_t                err;
    /* Read the nonzerostate stored in the device-side struct (assembly may
       have happened on the GPU) */
    err = cudaMemcpy( &nnz_state, &d_mat->nonzerostate, sizeof(PetscInt), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
  }
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  /* NOTE(review): compares the device state against A->nonzerostate as updated
     by MatAssemblyEnd_MPIAIJ above — confirm the intended ordering */
  if (nnz_state > A->nonzerostate) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }

  PetscFunctionReturn(0);
}
21634d6c7a5SJose E. Roman 
/* Destroys an MPIAIJCUSPARSE matrix: frees the device-side split-CSR structure
   (if one was created), releases the per-block cuSPARSE handles, destroys the
   shared cuSPARSE handle and stream, removes the composed functions, and then
   defers to the generic MPIAIJ destructor. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    /* The struct itself lives on the device; copy it back to find the
       device pointers it owns */
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* NOTE(review): the row-pointer arrays appear to be separately allocated
       only when compressed-row storage is in use — mirrors the allocation
       logic elsewhere in this file; confirm against MatCUSPARSEGetDeviceMatWrite */
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    /* Clear the handles the sub-matrices borrowed before destroying the
       shared cuSPARSE handle and stream */
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* Undo the compositions installed by MatConvert_MPIAIJ_MPIAIJCUSPARSE */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
258ca45077fSPaul Mullowney 
/* Converts an MPIAIJ matrix to MPIAIJCUSPARSE: duplicates or copies into the
   target depending on reuse mode, switches the sub-matrices and work vector to
   their CUDA types, allocates the Mat_MPIAIJCUSPARSE context (with a fresh
   cuSPARSE handle) when needed, and installs the CUSPARSE method table.

   Input Parameters:
.  B      - the source MPIAIJ matrix
.  mtype  - the target type (MATMPIAIJCUSPARSE)
.  reuse  - MAT_INITIAL_MATRIX, MAT_REUSE_MATRIX, or MAT_INPLACE_MATRIX

   Output Parameter:
.  newmat - the converted matrix (for MAT_INPLACE_MATRIX, *newmat == B) */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* For MAT_INPLACE_MATRIX neither branch runs and *newmat is B itself */
  A = *newmat;

  /* Vectors created for this matrix default to CUDA vectors */
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  /* Allocate the CUSPARSE context unless we are reusing a matrix that
     already has one */
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->stream              = 0;  /* default stream until one is set */
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  /* Install the CUSPARSE-specific operations */
  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3099ae82921SPaul Mullowney 
/* Type constructor for MATMPIAIJCUSPARSE: ensures CUDA is initialized, builds
   a plain MPIAIJ matrix, then converts it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3203338378cSStefano Zampini 
3213f9c0db1SPaul Mullowney /*@
3223f9c0db1SPaul Mullowney    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
323e057df02SPaul Mullowney    (the default parallel PETSc format).  This matrix will ultimately pushed down
3243f9c0db1SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
325e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
326e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
327e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
3289ae82921SPaul Mullowney 
329d083f849SBarry Smith    Collective
330e057df02SPaul Mullowney 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal portion of the local submatrix (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal portion of the local submatrix (possibly different for each row) or NULL
338e057df02SPaul Mullowney 
339e057df02SPaul Mullowney    Output Parameter:
340e057df02SPaul Mullowney .  A - the matrix
341e057df02SPaul Mullowney 
342e057df02SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
343e057df02SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
344e057df02SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
345e057df02SPaul Mullowney 
346e057df02SPaul Mullowney    Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
348e057df02SPaul Mullowney 
349e057df02SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
350e057df02SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
351e057df02SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
352e057df02SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
353e057df02SPaul Mullowney 
354e057df02SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
3550298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
356e057df02SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
357e057df02SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
358e057df02SPaul Mullowney 
359e057df02SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
360e057df02SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
361e057df02SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
362e057df02SPaul Mullowney    reusing matrix information to achieve increased efficiency.
363e057df02SPaul Mullowney 
364e057df02SPaul Mullowney    Level: intermediate
365e057df02SPaul Mullowney 
366e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
367e057df02SPaul Mullowney @*/
/* Creates an AIJ matrix whose data and SpMV computations live on the GPU via
   cuSPARSE. On a single-process communicator the result is a MATSEQAIJCUSPARSE
   matrix preallocated from (d_nz,d_nnz); otherwise a MATMPIAIJCUSPARSE matrix
   preallocated from all four nonzero arguments. The o_nz/o_nnz arguments are
   unused in the sequential case. */
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    nproc;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&nproc);CHKERRQ(ierr);
  if (nproc == 1) {
    /* Uniprocessor: sequential CUSPARSE matrix, diagonal preallocation only */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    /* Parallel: MPI CUSPARSE matrix with diagonal and off-diagonal preallocation */
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3869ae82921SPaul Mullowney 
3873ca39a21SBarry Smith /*MC
388e057df02SPaul Mullowney    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
389e057df02SPaul Mullowney 
3902692e278SPaul Mullowney    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
3912692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3922692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3939ae82921SPaul Mullowney 
3949ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
3959ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
3969ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
3979ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
3989ae82921SPaul Mullowney    the above preallocation routines for simplicity.
3999ae82921SPaul Mullowney 
4009ae82921SPaul Mullowney    Options Database Keys:
401e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
4028468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
4038468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
4048468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
4059ae82921SPaul Mullowney 
4069ae82921SPaul Mullowney   Level: beginner
4079ae82921SPaul Mullowney 
 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
4113fa6b06aSMark Adams 
4123fa6b06aSMark Adams // get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
4133fa6b06aSMark Adams PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
4143fa6b06aSMark Adams {
4159db3cbf9SStefano Zampini #if defined(PETSC_USE_CTABLE)
4169db3cbf9SStefano Zampini   SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
4179db3cbf9SStefano Zampini #else
4183fa6b06aSMark Adams   PetscSplitCSRDataStructure **p_d_mat;
4193fa6b06aSMark Adams   PetscMPIInt                size,rank;
4203fa6b06aSMark Adams   MPI_Comm                   comm;
4213fa6b06aSMark Adams   PetscErrorCode             ierr;
4223fa6b06aSMark Adams   int                        *ai,*bi,*aj,*bj;
4233fa6b06aSMark Adams   PetscScalar                *aa,*ba;
4243fa6b06aSMark Adams 
4253fa6b06aSMark Adams   PetscFunctionBegin;
4263fa6b06aSMark Adams   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
4273fa6b06aSMark Adams   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
4283fa6b06aSMark Adams   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
4293fa6b06aSMark Adams   if (A->factortype == MAT_FACTOR_NONE) {
4303fa6b06aSMark Adams     CsrMatrix *matrixA,*matrixB=NULL;
4313fa6b06aSMark Adams     if (size == 1) {
4323fa6b06aSMark Adams       Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
4333fa6b06aSMark Adams       p_d_mat = &cusparsestruct->deviceMat;
4343fa6b06aSMark Adams       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
4353fa6b06aSMark Adams       if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
4363fa6b06aSMark Adams         matrixA = (CsrMatrix*)matstruct->mat;
4373fa6b06aSMark Adams         bi = bj = NULL; ba = NULL;
438*6b78a27eSPierre Jolivet       } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
4393fa6b06aSMark Adams     } else {
4403fa6b06aSMark Adams       Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
4413fa6b06aSMark Adams       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
4423fa6b06aSMark Adams       p_d_mat = &spptr->deviceMat;
4433fa6b06aSMark Adams       Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
4443fa6b06aSMark Adams       Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
4453fa6b06aSMark Adams       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
4463fa6b06aSMark Adams       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
4473fa6b06aSMark Adams       if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
4483fa6b06aSMark Adams         if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
4493fa6b06aSMark Adams         matrixA = (CsrMatrix*)matstructA->mat;
4503fa6b06aSMark Adams         matrixB = (CsrMatrix*)matstructB->mat;
4513fa6b06aSMark Adams         bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
4523fa6b06aSMark Adams         bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
4533fa6b06aSMark Adams         ba = thrust::raw_pointer_cast(matrixB->values->data());
454*6b78a27eSPierre Jolivet       } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
4553fa6b06aSMark Adams     }
4563fa6b06aSMark Adams     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
4573fa6b06aSMark Adams     aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
4583fa6b06aSMark Adams     aa = thrust::raw_pointer_cast(matrixA->values->data());
4593fa6b06aSMark Adams   } else {
4603fa6b06aSMark Adams     *B = NULL;
4613fa6b06aSMark Adams     PetscFunctionReturn(0);
4623fa6b06aSMark Adams   }
4633fa6b06aSMark Adams   // act like MatSetValues because not called on host
4643fa6b06aSMark Adams   if (A->assembled) {
4653fa6b06aSMark Adams     if (A->was_assembled) {
4663fa6b06aSMark Adams       ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
4673fa6b06aSMark Adams     }
4683fa6b06aSMark Adams     A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
4693fa6b06aSMark Adams   } else {
4703fa6b06aSMark Adams     SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
4713fa6b06aSMark Adams   }
4723fa6b06aSMark Adams   if (!*p_d_mat) {
4733fa6b06aSMark Adams     cudaError_t                 err;
4743fa6b06aSMark Adams     PetscSplitCSRDataStructure  *d_mat, h_mat;
4753fa6b06aSMark Adams     Mat_SeqAIJ                  *jaca;
4763fa6b06aSMark Adams     PetscInt                    n = A->rmap->n, nnz;
4773fa6b06aSMark Adams     // create and copy
4783fa6b06aSMark Adams     ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
4793fa6b06aSMark Adams     err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
4803fa6b06aSMark Adams     err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
4813fa6b06aSMark Adams     *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
4823fa6b06aSMark Adams     if (size == 1) {
4833fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)A->data;
4843fa6b06aSMark Adams       h_mat.nonzerostate = A->nonzerostate;
4853fa6b06aSMark Adams       h_mat.rstart = 0; h_mat.rend = A->rmap->n;
4863fa6b06aSMark Adams       h_mat.cstart = 0; h_mat.cend = A->cmap->n;
4873fa6b06aSMark Adams       h_mat.offdiag.i = h_mat.offdiag.j = NULL;
4883fa6b06aSMark Adams       h_mat.offdiag.a = NULL;
4893fa6b06aSMark Adams       h_mat.seq = PETSC_TRUE;
4903fa6b06aSMark Adams     } else {
4913fa6b06aSMark Adams       Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
4923fa6b06aSMark Adams       Mat_SeqAIJ  *jacb;
4933fa6b06aSMark Adams       h_mat.seq = PETSC_FALSE; // for MatAssemblyEnd_SeqAIJCUSPARSE
4943fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)aij->A->data;
4953fa6b06aSMark Adams       jacb = (Mat_SeqAIJ*)aij->B->data;
4963fa6b06aSMark Adams       h_mat.nonzerostate = aij->A->nonzerostate; // just keep one nonzero state?
4973fa6b06aSMark Adams       if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
4983fa6b06aSMark Adams       if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
4993fa6b06aSMark Adams       // create colmap - this is usually done (lazy) in MatSetValues
5003fa6b06aSMark Adams       aij->donotstash = PETSC_TRUE;
5013fa6b06aSMark Adams       aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
5023fa6b06aSMark Adams       jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly
5033fa6b06aSMark Adams       ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
5043fa6b06aSMark Adams       aij->colmap[A->cmap->N] = -9;
5053fa6b06aSMark Adams       ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
5063fa6b06aSMark Adams       {
5073fa6b06aSMark Adams         PetscInt ii;
5083fa6b06aSMark Adams         for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
5093fa6b06aSMark Adams       }
5103fa6b06aSMark Adams       if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
5113fa6b06aSMark Adams       // allocate B copy data
5123fa6b06aSMark Adams       h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
5133fa6b06aSMark Adams       h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
5143fa6b06aSMark Adams       nnz = jacb->i[n];
5153fa6b06aSMark Adams 
5163fa6b06aSMark Adams       if (jacb->compressedrow.use) {
5173fa6b06aSMark Adams         err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
5183fa6b06aSMark Adams         err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
519*6b78a27eSPierre Jolivet       } else h_mat.offdiag.i = bi;
5203fa6b06aSMark Adams       h_mat.offdiag.j = bj;
5213fa6b06aSMark Adams       h_mat.offdiag.a = ba;
5223fa6b06aSMark Adams 
5233fa6b06aSMark Adams       err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
5243fa6b06aSMark Adams       err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
5253fa6b06aSMark Adams       h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
5263fa6b06aSMark Adams       h_mat.offdiag.n = n;
5273fa6b06aSMark Adams     }
5283fa6b06aSMark Adams     // allocate A copy data
5293fa6b06aSMark Adams     nnz = jaca->i[n];
5303fa6b06aSMark Adams     h_mat.diag.n = n;
5313fa6b06aSMark Adams     h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
5323fa6b06aSMark Adams     ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr);
5333fa6b06aSMark Adams     if (jaca->compressedrow.use) {
5343fa6b06aSMark Adams       err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
5353fa6b06aSMark Adams       err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
5363fa6b06aSMark Adams     } else {
5373fa6b06aSMark Adams       h_mat.diag.i = ai;
5383fa6b06aSMark Adams     }
5393fa6b06aSMark Adams     h_mat.diag.j = aj;
5403fa6b06aSMark Adams     h_mat.diag.a = aa;
5413fa6b06aSMark Adams     // copy pointers and metadata to device
5423fa6b06aSMark Adams     err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
5433fa6b06aSMark Adams     ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
5443fa6b06aSMark Adams   } else {
5453fa6b06aSMark Adams     *B = *p_d_mat;
5463fa6b06aSMark Adams   }
5473fa6b06aSMark Adams   A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
5483fa6b06aSMark Adams   PetscFunctionReturn(0);
5499db3cbf9SStefano Zampini #endif
5503fa6b06aSMark Adams }
551