/* xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision d98d7c49b8eb479eaa6f6b6775e161f1f4109d75) */
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b               = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  if (b->lvec) {
    ierr = VecSetType(b->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t  err;
      PetscScalar  *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
  PetscInt                    nnz_state = A->nonzerostate;
  PetscFunctionBegin;
  if (d_mat) {
    cudaError_t                err;
    err = cudaMemcpy( &nnz_state, &d_mat->nonzerostate, sizeof(PetscInt), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
  }
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (nnz_state > A->nonzerostate) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }

  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;

  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->stream              = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE)
.  n - number of local columns (or PETSC_DECIDE)
.  M - number of global rows (or PETSC_DETERMINE)
.  N - number of global columns (or PETSC_DETERMINE)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros per row in the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row in the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage) is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance; see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

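   Example usage (an illustrative sketch; the local sizes m and n and the preallocation
   counts 5 and 2 are placeholders, not recommendations):
.vb
   Mat A;
   ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,m,n,PETSC_DETERMINE,PETSC_DETERMINE,5,NULL,2,NULL,&A);CHKERRQ(ierr);
   /* insert entries with MatSetValues(), then assemble */
   ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
   ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
.ve
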
   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity; a sketch is shown below.

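   For example, one might set up the matrix explicitly as follows (a minimal sketch; the
   preallocation counts 5 and 2 are illustrative only):
.vb
   ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
   ierr = MatSetSizes(A,m,n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
   ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
   ierr = MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);CHKERRQ(ierr);
   ierr = MatSeqAIJSetPreallocation(A,5,NULL);CHKERRQ(ierr);
.ve
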
   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

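   For instance, the storage format can be selected at run time (the executable name and
   process count below are illustrative):
.vb
   mpiexec -n 4 ./myapp -mat_type mpiaijcusparse -mat_cusparse_storage_format ell
.ve
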
  Level: beginner

 .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

// Get a device (GPU) pointer to a stripped-down PetscSplitCSRDataStructure view of the matrix (CSR
// arrays of the diagonal and, for MPI matrices, off-diagonal blocks plus the column map) so entries
// can be written directly from device kernels. Works for both Seq and MPI Mat; the matrix is marked
// unassembled on return, so MatAssemblyBegin/End must be called afterwards.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;
  PetscScalar                *aa,*ba;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
      }
    } else {
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
        if (rank==-1) {
          for (unsigned int i = 0; i < matrixB->row_offsets->size(); i++)
            std::cout << "\trow_offsets[" << i << "] = " << (*matrixB->row_offsets)[i] << std::endl;
          for (unsigned int i = 0; i < matrixB->column_indices->size(); i++)
            std::cout << "\tcolumn_indices[" << i << "] = " << (*matrixB->column_indices)[i] << std::endl;
          for (unsigned int i = 0; i < matrixB->values->size(); i++)
            std::cout << "\tvalues[" << i << "] = " << (*matrixB->values)[i] << std::endl;
        }
      } else {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
      }
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because this is not called on the host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assembled more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazily) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Matrix must be assembled first");
  }
  if (!*p_d_mat) {
    cudaError_t                 err;
    PetscSplitCSRDataStructure  *d_mat, h_mat;
    Mat_SeqAIJ                  *jaca;
    PetscInt                    n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.nonzerostate = A->nonzerostate;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
      h_mat.seq = PETSC_TRUE;
    } else {
      Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ  *jacb;
      h_mat.seq = PETSC_FALSE; // for MatAssemblyEnd_SeqAIJCUSPARSE
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      h_mat.nonzerostate = aij->A->nonzerostate; // just keep one nonzero state?
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazily) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9;
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      if (jacb->compressedrow.use) {
        err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else {
        h_mat.offdiag.i = bi;
      }
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with MatSetValues - this is done (lazily) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}
565