xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 08391a17dcb2d35b867018288e008527b6a3e638)
1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
253800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
399acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
40fd2b57fSKarl Rupp 
53d13b8fdSMatthew G. Knepley #include <petscconf.h>
69ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
757d48284SJunchao Zhang #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
83d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
97e8381f9SStefano Zampini #include <thrust/advance.h>
107e8381f9SStefano Zampini 
/* Thrust functor: copies tuple element <0> into tuple element <1>.
   Used with a zip iterator pairing a permutation iterator over the user-supplied
   values with the permuted work array (see MatSetValuesCOO_MPIAIJCUSPARSE), so that
   values end up in the diag/offdiag-split ordering computed at preallocation time. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
207e8381f9SStefano Zampini 
/* Sets the matrix values for the COO pattern established by MatSetPreallocationCOO_MPIAIJCUSPARSE().

   A     - the MPIAIJCUSPARSE matrix
   v     - array of coo_nd+coo_no scalars in the original user COO ordering;
           may be host or device memory (detected via isCudaMem), or NULL
   imode - insert or add values

   If a permutation (cusp->coo_p) was computed at preallocation time (i.e. the user's
   COO entries were not already split diag-first/offdiag-second), v is permuted on the
   device into cusp->coo_pw before being handed to the sequential diag (a->A) and
   offdiag (a->B) blocks; otherwise v is passed through directly. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no; /* total COO entries: diagonal-block + offdiagonal-block counts */
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (cusp->coo_p && v) { /* entries need permuting into the split ordering */
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      /* v already lives on the device; wrap it without copying */
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* v is host memory: stage a device copy in a temporary array */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* zip (permuted source value, destination work slot) and copy element-wise on the device */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w; /* safe: the for_each above has completed; delete NULL is a no-op when v was device memory */
    /* first coo_nd permuted values belong to the diagonal block, the rest to the offdiagonal block */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    /* no permutation needed (or v is NULL): forward directly, splitting at coo_nd */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  /* mark the matrix assembled with both host and device data up to date */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}
677e8381f9SStefano Zampini 
/* Thrust predicate over (row, col, perm) tuples: true when the column index
   (tuple element <1>) falls inside the owned column range [_cstart, _cend),
   i.e. the entry belongs to the diagonal block. Used by thrust::partition. */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt first, PetscInt last) : _cstart(first), _cend(last) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    const PetscInt col = thrust::get<1>(t);
    return (col >= _cstart) && (col < _cend);
  }
};
807e8381f9SStefano Zampini 
/* Thrust predicate on a bare column index: true when the column lies outside
   the owned range [_cstart, _cend), i.e. the entry belongs to the offdiagonal block. */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt first, PetscInt last) : _cstart(first), _cend(last) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return !(c >= _cstart && c < _cend);
  }
};
927e8381f9SStefano Zampini 
/* Thrust transform functor: converts a global index into a process-local one
   by subtracting the first index owned by this process. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt first) : _start(first) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &g)
  {
    const PetscInt local = g - _start;
    return local;
  }
};
1047e8381f9SStefano Zampini 
/* Preallocates the parallel matrix from a list of n COO entries (coo_i[k], coo_j[k])
   given in global indices.

   The entries are copied to the device, partitioned into diagonal-block entries
   (columns inside [cmap->rstart, cmap->rend)) followed by offdiagonal entries,
   and the indices are converted to local numbering. The permutation applied by
   the partition is recorded in cusp->coo_p (and a work array cusp->coo_pw is
   allocated) so MatSetValuesCOO_MPIAIJCUSPARSE can later reorder user values the
   same way. The diag (b->A) and offdiag (b->B) sequential matrices are recreated
   from scratch, so any previous preallocation/assembly state of B is discarded. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;       /* host copy of the offdiag column indices (global, later localized) */
  size_t                 noff = 0;  /* number of distinct offdiagonal columns */
  THRUSTINTARRAY         d_i(n);    /* device copy of row indices */
  THRUSTINTARRAY         d_j(n);    /* device copy of column indices */
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* tear down any previous diag/offdiag blocks and scatter data; they are rebuilt below */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* locate the first offdiagonal entry, and the first diagonal entry after it;
     if both exist the input is not already split diag-first and must be partitioned */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* partition (i,j,identity-permutation) tuples so diagonal entries come first;
       coo_p ends up recording where each original entry moved to */
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* # entries in diagonal block */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* # entries in offdiagonal block */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* sort+unique the offdiag tail of d_j in place to get the distinct global columns (garray) */
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* translate the offdiag global columns stored in jj to local indices into garray
     (hash-based mapping; IS_GTOLM_DROP would drop unmapped entries, checked below) */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* recreate the sequential diag (n x local-cols) and offdiag (n x noff) blocks */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  /* propagate storage formats, cusparse handle and stream to the new blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  /* values are not set yet; matrix is unassembled and considered to live on the CPU */
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
2089ae82921SPaul Mullowney 
/* Returns in A_loc a sequential matrix obtained by merging the diagonal and
   offdiagonal blocks of the parallel matrix A (columns of the diag block first,
   then the offdiag columns). If glob is non-NULL, also returns an IS mapping the
   merged local column numbering back to global column indices: the first dn
   entries are the owned columns (cst..cst+dn-1), followed by the offdiag column
   map (cmap). scall selects initial creation vs. reuse of A_loc. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    /* owned columns are contiguous starting at cst; offdiag columns come from cmap */
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    /* IS takes ownership of gidx (PETSC_OWN_POINTER), so no free here */
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
231ed502f03SStefano Zampini 
/* Standard (non-COO) preallocation for MPIAIJCUSPARSE matrices.

   B              - the matrix
   d_nz  / d_nnz  - uniform / per-row nonzero counts for the diagonal block
   o_nz  / o_nnz  - uniform / per-row nonzero counts for the offdiagonal block

   Creates the two sequential MATSEQAIJCUSPARSE blocks (on first call), preallocates
   them, and pushes the configured storage formats, cusparse handle and stream down
   to both blocks. */
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* validate user-provided per-row counts only in debug builds */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  /* (re)apply the CUSPARSE type even when reusing previously created blocks */
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  if (b->lvec) {
    ierr = VecSetType(b->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
280e057df02SPaul Mullowney 
/* yy = A*xx for a parallel AIJCUSPARSE matrix.

   Overlaps communication with computation: the scatter of the needed ghost values
   of xx into a->lvec is started, the local diagonal-block product is computed while
   the scatter is in flight, and the offdiagonal contribution is added once the
   scatter completes. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);                 /* diag block: yy = Ad*xx */
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);      /* offdiag block: yy += Ao*lvec */
  PetscFunctionReturn(0);
}
296ca45077fSPaul Mullowney 
/* Zeroes all entries of a parallel AIJCUSPARSE matrix.

   If a split-CSR device matrix (spptr->deviceMat) exists, its diag and offdiag
   value arrays are zeroed directly on the GPU: the device-side pointers are first
   copied back to the host (the struct itself lives in device memory) and then
   cudaMemset to zero. The host-side blocks l->A and l->B are always zeroed via
   MatZeroEntries so both representations stay consistent. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n]; /* nnz counts from the CSR row offsets */
      cudaError_t  err;
      PetscScalar  *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr); /* fixed typo: was "offfdiag" */
      /* fetch the device-resident value-array pointers, then zero them in place on the GPU */
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
/* zz = yy + A*xx for a parallel AIJCUSPARSE matrix.

   Same communication/computation overlap as MatMult_MPIAIJCUSPARSE: the ghost
   scatter of xx runs concurrently with the diagonal-block multadd, then the
   offdiagonal contribution is accumulated into zz. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);           /* diag block: zz = yy + Ad*xx */
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);      /* offdiag block: zz += Ao*lvec */
  PetscFunctionReturn(0);
}
339fdc842d1SBarry Smith 
/* yy = A^T*xx for a parallel AIJCUSPARSE matrix.

   The offdiagonal-block transpose product goes into the ghost vector lvec first;
   the diagonal-block transpose product fills yy; then the ghost contributions are
   scattered back (reverse scatter, ADD_VALUES) into yy. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);   /* lvec = Ao^T*xx */
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);        /* yy = Ad^T*xx */
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3559ae82921SPaul Mullowney 
/* Records the requested cusparse storage format for the diagonal and/or
   offdiagonal block of an MPIAIJCUSPARSE matrix.

   op     - which block(s) the format applies to: MAT_CUSPARSE_MULT_DIAG,
            MAT_CUSPARSE_MULT_OFFDIAG, or MAT_CUSPARSE_ALL (both)
   format - the cusparse storage format to use

   The formats are stored in the Mat_MPIAIJCUSPARSE struct and applied to the
   sequential blocks at preallocation time. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* fixed error message: it previously listed MAT_CUSPARSE_MULT_DIAG twice and a
       nonexistent MAT_CUSPARSE_MULT_ALL instead of the actually handled enums */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
378e057df02SPaul Mullowney 
/* Processes command-line options for MPIAIJCUSPARSE matrices:
     -mat_cusparse_mult_diag_storage_format    : format of the diagonal block for SpMV
     -mat_cusparse_mult_offdiag_storage_format : format of the off-diagonal block for SpMV
     -mat_cusparse_storage_format              : format of both blocks
   Options are only consulted for non-factored matrices. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
409e057df02SPaul Mullowney 
/* Finishes assembly of an MPIAIJCUSPARSE matrix.

   Delegates to MatAssemblyEnd_MPIAIJ, then on the first final assembly switches
   the ghost vector lvec (created during assembly) to the CUDA vector type. If a
   split-CSR device matrix exists, the values were assembled directly on the GPU,
   so the offload mask is set to PETSC_OFFLOAD_GPU. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }

  PetscFunctionReturn(0);
}
42734d6c7a5SJose E. Roman 
/* Destroys an MPIAIJCUSPARSE matrix.

   Frees the device-side split-CSR mirror (if one was built): the struct is copied
   back to the host first so the embedded device pointers (compressed-row index
   arrays, colmap) can be cudaFree'd individually before the struct itself. Then
   the cusparse handles of both blocks, the cusparse library handle, the optional
   stream, and the COO permutation arrays are released, the composed methods are
   un-composed, and destruction is delegated to MatDestroy_MPIAIJ. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* bring the struct to the host to recover its device-pointer members */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* the i arrays were only device-allocated when compressed-row storage is in use */
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* remove the methods composed at creation/convert time */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
474ca45077fSPaul Mullowney 
/*
   MatConvert_MPIAIJ_MPIAIJCUSPARSE - Converts an MPIAIJ matrix to MPIAIJCUSPARSE.

   Reuse semantics: MAT_INITIAL_MATRIX duplicates B into *newmat;
   MAT_REUSE_MATRIX copies values into the existing *newmat;
   MAT_INPLACE_MATRIX (neither branch taken) retypes B itself, since the caller
   passes *newmat == B in that case.

   The routine then: switches the default vector type to CUDA, retypes the
   diagonal/off-diagonal sequential blocks to MATSEQAIJCUSPARSE, allocates the
   GPU bookkeeping struct (skipped on reuse, where it already exists), and
   installs the CUSPARSE method table and composed functions.
*/
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat; /* for MAT_INPLACE_MATRIX this is B itself */

  /* vectors created from this matrix should default to GPU storage */
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    /* local work vector for MatMult ghost updates must live on the GPU too */
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;  /* COO permutation, built lazily */
    cusparseStruct->coo_pw              = NULL;  /* COO permuted-values workspace */
    cusparseStruct->stream              = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;            /* split-CSR mirror, built on demand */
  }

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
5309ae82921SPaul Mullowney 
/*
   MatCreate_MPIAIJCUSPARSE - Registered constructor for the MATMPIAIJCUSPARSE type.

   Verifies/initializes the CUDA runtime, builds a plain MPIAIJ matrix, then
   converts it in place to the CUSPARSE type (which installs the GPU method table).
*/
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
5413338378cSStefano Zampini 
5423f9c0db1SPaul Mullowney /*@
5433f9c0db1SPaul Mullowney    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
5453f9c0db1SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
546e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
547e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
548e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
5499ae82921SPaul Mullowney 
550d083f849SBarry Smith    Collective
551e057df02SPaul Mullowney 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal portion of the local submatrix
         (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal portion of the local submatrix
         (possibly different for each row) or NULL
559e057df02SPaul Mullowney 
560e057df02SPaul Mullowney    Output Parameter:
561e057df02SPaul Mullowney .  A - the matrix
562e057df02SPaul Mullowney 
563e057df02SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
564e057df02SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
565e057df02SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
566e057df02SPaul Mullowney 
567e057df02SPaul Mullowney    Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
569e057df02SPaul Mullowney 
570e057df02SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
571e057df02SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
572e057df02SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
573e057df02SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
574e057df02SPaul Mullowney 
   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL (and likewise for o_nz/o_nnz) for PETSc to control dynamic memory
577e057df02SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
578e057df02SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
579e057df02SPaul Mullowney 
580e057df02SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
581e057df02SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
582e057df02SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
583e057df02SPaul Mullowney    reusing matrix information to achieve increased efficiency.
584e057df02SPaul Mullowney 
585e057df02SPaul Mullowney    Level: intermediate
586e057df02SPaul Mullowney 
587e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
588e057df02SPaul Mullowney @*/
/*
   MatCreateAIJCUSPARSE - Creates an AIJ matrix whose type is chosen from the
   communicator size: MATSEQAIJCUSPARSE on one process, MATMPIAIJCUSPARSE
   otherwise, and applies the corresponding preallocation (only the diagonal
   arguments are meaningful in the sequential case).
*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    nproc;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&nproc);CHKERRMPI(ierr);
  if (nproc == 1) {
    /* uniprocessor communicator: sequential cuSPARSE matrix */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    /* parallel communicator: distributed cuSPARSE matrix with diag/offdiag preallocation */
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
6079ae82921SPaul Mullowney 
6083ca39a21SBarry Smith /*MC
609e057df02SPaul Mullowney    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
610e057df02SPaul Mullowney 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
6122692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
6132692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
6149ae82921SPaul Mullowney 
6159ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
6169ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
6179ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
6189ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
6199ae82921SPaul Mullowney    the above preallocation routines for simplicity.
6209ae82921SPaul Mullowney 
6219ae82921SPaul Mullowney    Options Database Keys:
622e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
6238468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
6248468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
6258468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
6269ae82921SPaul Mullowney 
6279ae82921SPaul Mullowney   Level: beginner
6289ae82921SPaul Mullowney 
6298468deeeSKarl Rupp  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
6308468deeeSKarl Rupp M
6319ae82921SPaul Mullowney M*/
6323fa6b06aSMark Adams 
// Get a GPU pointer to a stripped-down split-CSR mirror of the matrix so device
// kernels can write values directly. Works for both Seq and MPI AIJCUSPARSE.
// The mirror is built once and cached (in the seq or mpi cusparse spptr); the
// matrix is flagged unassembled on return, mimicking what host-side
// MatSetValues would do, so the caller must follow device writes with
// MatAssemblyBegin/End. For factored matrices *B is set to NULL.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;  /* address of the cached device pointer */
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;  /* device CSR row offsets / column indices */
  PetscScalar                *aa,*ba;          /* device CSR values (diag / offdiag) */

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      /* sequential: only the "diagonal" block exists; off-diagonal pointers stay NULL */
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      /* parallel: grab device CSR arrays of both the diagonal (A) and off-diagonal (B) blocks */
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    /* factored matrices have no writable device mirror */
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because not called on host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
  }
  if (!*p_d_mat) {
    cudaError_t                 err;
    PetscSplitCSRDataStructure  *d_mat, h_mat;  /* device struct + host staging copy */
    Mat_SeqAIJ                  *jaca;
    PetscInt                    n = A->rmap->n, nnz;
    // create and copy: build h_mat on the host, then memcpy it into d_mat
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
    } else {
      Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ  *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazy) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9;  /* sentinel: detects out-of-range garray entries in the loop below */
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        /* colmap: global column -> (local offdiag column + 1), 0 meaning "not present" */
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      /* compressed-row case needs its own n+1-sized device copy of the row offsets;
         otherwise we alias the cuSPARSE-owned array bi (freed accordingly in MatDestroy) */
      if (jacb->compressedrow.use) {
        err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi;
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
    /* same aliasing rule as the off-diagonal block above */
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat;  /* mirror already built: hand back the cached pointer */
  }
  A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}
768