xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 4e84afc0f1a4047b7fb785e275ab9ac0e73c2b80)
1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
253800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
399acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
40fd2b57fSKarl Rupp 
53d13b8fdSMatthew G. Knepley #include <petscconf.h>
69ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
757d48284SJunchao Zhang #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
83d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
97e8381f9SStefano Zampini #include <thrust/advance.h>
104eb5378fSStefano Zampini #include <petscsf.h>
117e8381f9SStefano Zampini 
/* Functor for thrust::for_each over a zip iterator: assigns the first
   tuple component to the second, i.e. an element-wise device-side copy. */
struct VecCUDAEquals
{
  template <typename T>
  __host__ __device__
  void operator()(T t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
217e8381f9SStefano Zampini 
/* Set the matrix values from an array v ordered as the (i,j) pairs given to
   MatSetPreallocationCOO_MPIAIJCUSPARSE, forwarding the diagonal part to a->A
   and the off-diagonal part to a->B. When preallocation produced a permutation
   (cusp->coo_p), v is first gathered through it into the work array coo_pw. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no; /* total COO entries: diagonal + off-diagonal counts set at preallocation */
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (cusp->coo_p && v) {
    /* a permutation was recorded at preallocation time: gather v through it */
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      /* v already resides on the device: wrap it, no copy */
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* v is host memory: stage it into a temporary device array */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* zip (permuted view of v, destination coo_pw) so for_each copies
       v[coo_p[k]] into coo_pw[k] */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w; /* safe after WaitForCUDA(); deleting NULL is a no-op on the device-memory path */
    /* first coo_nd entries of coo_pw belong to the diagonal block, the rest to the off-diagonal block */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    /* no permutation needed (or v == NULL): pass v through with the diagonal/off-diagonal split */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  /* mark the matrix assembled with up-to-date data on the GPU */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
687e8381f9SStefano Zampini 
/* Predicate for thrust::partition over (row, col, perm) tuples: true when the
   column index (tuple component 1) lies inside the owned column range
   [cbegin_, cend_), i.e. the entry belongs to the diagonal block. */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt cbegin_,cend_;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : cbegin_(cstart), cend_(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    const PetscInt col = thrust::get<1>(t);
    return (col >= cbegin_) && (col < cend_);
  }
};
817e8381f9SStefano Zampini 
/* Predicate: true when a global column index falls outside the locally owned
   column range [start_, end_), i.e. the entry belongs to the off-diagonal block. */
struct IsOffDiag
{
  PetscInt start_,end_;

  IsOffDiag(PetscInt cstart, PetscInt cend) : start_(cstart), end_(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return (c < start_) || (c >= end_);
  }
};
937e8381f9SStefano Zampini 
/* Unary functor mapping a global index to a local one by subtracting the
   first locally owned index. */
struct GlobToLoc
{
  PetscInt offset_;

  GlobToLoc(PetscInt start) : offset_(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - offset_;
  }
};
1057e8381f9SStefano Zampini 
/* Preallocate an MPIAIJCUSPARSE matrix from n COO (coo_i, coo_j) pairs given
   in global indices. On the device the entries are split into diagonal-block
   (owned columns) and off-diagonal-block parts, a permutation (cusp->coo_p)
   recording the reordering is kept so MatSetValuesCOO can apply it to the
   values later, and the two sequential CUSPARSE blocks b->A and b->B are
   created and preallocated. Any previous layout of B is destroyed. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;    /* host copy of the off-diagonal column indices */
  size_t                 noff = 0; /* number of distinct off-diagonal (ghost) columns */
  THRUSTINTARRAY         d_i(n); /* device copy of coo_i */
  THRUSTINTARRAY         d_j(n); /* device copy of coo_j */
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* discard any existing block structure; it is rebuilt from scratch below */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* locate the first off-diagonal entry, and the first diagonal entry after it;
     if both exist the input is not already split and must be partitioned */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* partition (i,j,sequence) tuples so diagonal entries come first; coo_p
       records where each original entry ended up, coo_pw is allocated as the
       matching work array for the values in MatSetValuesCOO */
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* # diagonal-block entries */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* # off-diagonal entries */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd);
  /* sort+unique the off-diagonal tail of d_j: the distinct global ghost
     columns become the garray of b->B */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* map the off-diagonal global columns in jj to local (compressed) indices
     via a hash global-to-local mapping built on garray; n is reused as the
     mapped count for the consistency check below */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* (re)create the sequential diagonal and off-diagonal blocks */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  /* propagate format/handle/stream settings to the new blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  /* values are not set yet: matrix is unassembled with master copy on the CPU */
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
2099ae82921SPaul Mullowney 
/* Merge the diagonal and off-diagonal blocks of A into one sequential matrix
   A_loc; if glob is non-NULL, also return the global column indices of the
   merged matrix (owned columns first, then the ghost columns from cmap). */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, j, nd, no, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&nd);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&no);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(nd+no,&gidx);CHKERRQ(ierr);
    /* owned columns are contiguous starting at cst; ghost columns follow,
       taken from the compressed-to-global map */
    for (j=0; j<nd; j++) gidx[j]    = cst + j;
    for (j=0; j<no; j++) gidx[nd+j] = cmap[j];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),nd+no,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
232ed502f03SStefano Zampini 
/* Preallocate the diagonal (A) and off-diagonal (B) sequential CUSPARSE
   blocks of an MPIAIJCUSPARSE matrix from per-row nonzero counts, recreating
   the off-diagonal block and the communication structures from scratch. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscErrorCode     ierr;
  PetscInt           r;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* in debug builds, validate the user-supplied per-row counts */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (r=0; r<B->rmap->n; r++) {
      if (d_nnz[r] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",r,d_nnz[r]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (r=0; r<B->rmap->n; r++) {
      if (o_nnz[r] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",r,o_nnz[r]);
    }
  }
  /* drop any stale column map and communication data */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&mpiaij->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(mpiaij->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(mpiaij->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&mpiaij->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&mpiaij->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&mpiaij->B);CHKERRQ(ierr);
  if (!mpiaij->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr);
    ierr = MatSetSizes(mpiaij->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)mpiaij->A);CHKERRQ(ierr);
  }
  if (!mpiaij->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr);
    /* on a single rank there are no off-process columns */
    ierr = MatSetSizes(mpiaij->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)mpiaij->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(mpiaij->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(mpiaij->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(mpiaij->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(mpiaij->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* propagate format/handle/stream settings to both blocks */
  ierr = MatCUSPARSESetFormat(mpiaij->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(mpiaij->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(mpiaij->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(mpiaij->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(mpiaij->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(mpiaij->B,cusp->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
291e057df02SPaul Mullowney 
/*@
   MatAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Not collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE or MPIAIJCUSPARSE
-  gen - the boolean flag

   Level: intermediate

.seealso: MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE
@*/
PetscErrorCode  MatAIJCUSPARSESetGenerateTranspose(Mat A, PetscBool gen)
{
  PetscErrorCode ierr;
  PetscBool      ismpi;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  MatCheckPreallocated(A,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATMPIAIJ,&ismpi);CHKERRQ(ierr);
  if (!ismpi) {
    /* sequential matrix: set the flag directly */
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,gen);CHKERRQ(ierr);
  } else {
    /* parallel matrix: forward the flag to both sequential blocks */
    Mat Ad,Ao;

    ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,NULL);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(Ad,gen);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(Ao,gen);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
325e589036eSStefano Zampini 
/* yy = A*xx, overlapping the scatter of ghost values with the product of
   the diagonal block; the off-diagonal contribution is then added in. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       xlen;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,xlen);
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
341ca45077fSPaul Mullowney 
/* Zero all stored entries of A. If a device-side split-CSR structure exists
   (assembly was done on the GPU), its value arrays are zeroed directly with
   cudaMemset before the host-side blocks are zeroed. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
      /* nonzero counts taken from the host CSR row pointers of each block */
      PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t  err;
      PetscScalar  *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offfdiag\n");CHKERRQ(ierr);
      /* d_mat lives in device memory: fetch the device pointer stored in its
         diag.a field to the host, then memset that array on the device */
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      /* same for the off-diagonal block */
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  /* zero the (host-managed) sequential blocks through the regular path */
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}
/* zz = yy + A*xx, overlapping the ghost-value scatter with the
   diagonal-block multiply-add; the off-diagonal contribution follows. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       xlen;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,xlen);
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
384fdc842d1SBarry Smith 
/* yy = A^T*xx: the off-diagonal transpose product goes into the local work
   vector and is then scattered back (reverse mode, adding) into yy on top of
   the diagonal-block transpose product. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       xlen;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&xlen);CHKERRQ(ierr);
  if (xlen != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,xlen);
  ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
4009ae82921SPaul Mullowney 
/* Record the requested cuSPARSE storage format for the diagonal and/or
   off-diagonal block of an MPIAIJCUSPARSE matrix; the stored format is
   applied to the sequential blocks when they are (re)built.

   Input Parameters:
.  A      - matrix of type MPIAIJCUSPARSE
.  op     - which block(s) the format applies to (MAT_CUSPARSE_MULT_DIAG,
            MAT_CUSPARSE_MULT_OFFDIAG, or MAT_CUSPARSE_ALL)
.  format - the cuSPARSE storage format */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* bug fix: the old message listed MAT_CUSPARSE_MULT_DIAG twice and a
       nonexistent MAT_CUSPARSE_MULT_ALL; list the actual supported values */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
423e057df02SPaul Mullowney 
/* Process runtime options selecting the cuSPARSE storage format for the
   diagonal block, the off-diagonal block, or both (non-factored matrices only). */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat fmt;
  PetscErrorCode           ierr;
  PetscBool                set;
  Mat_MPIAIJ               *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,fmt);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->offdiagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,fmt);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
454e057df02SPaul Mullowney 
/* Complete assembly via the generic MPIAIJ path, switch the ghost-value work
   vector to the CUDA vector type on first final assembly, and mark the data
   as resident on the GPU when a device-side matrix structure exists. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat  = cusp->deviceMat;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }

  PetscFunctionReturn(0);
}
47234d6c7a5SJose E. Roman 
/* Destroys an MPIAIJCUSPARSE matrix: frees the device-resident split-CSR metadata
   (if MatCUSPARSEGetDeviceMatWrite() created one), releases the cuSPARSE handle,
   stream and COO scratch storage, removes the composed methods, and finally
   dispatches to the MPIAIJ destroy. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (!cusp) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusp->deviceMat) {
    /* pull the device struct back so the device arrays it owns can be freed */
    Mat_SeqAIJ                 *jaca  = (Mat_SeqAIJ*)mpiaij->A->data;
    Mat_SeqAIJ                 *jacb  = (Mat_SeqAIJ*)mpiaij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusp->deviceMat, h_mat;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err  = cudaMemcpy(&h_mat,d_mat,sizeof(PetscSplitCSRDataStructure),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* the row-offset arrays are separate device allocations only in the compressed-row case */
    if (jaca->compressedrow.use) { err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); }
    if (jacb->compressedrow.use) { err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err); }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (mpiaij->A) { ierr = MatCUSPARSEClearHandle(mpiaij->A);CHKERRQ(ierr); }
    if (mpiaij->B) { ierr = MatCUSPARSEClearHandle(mpiaij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusp->handle);CHKERRCUSPARSE(stat);
    if (cusp->stream) { err = cudaStreamDestroy(cusp->stream);CHKERRCUDA(err); }
    delete cusp->coo_p;
    delete cusp->coo_pw;
    delete cusp;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* detach the type-specific composed methods before handing off to the base destroy */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
519ca45077fSPaul Mullowney 
/* Converts an MPIAIJ matrix to MPIAIJCUSPARSE.

   reuse == MAT_INITIAL_MATRIX duplicates B into *newmat, MAT_REUSE_MATRIX copies the
   values into an existing *newmat, and MAT_INPLACE_MATRIX converts B itself.  The
   diagonal/off-diagonal blocks and the scatter vector are retyped to their CUDA
   variants, the cuSPARSE side-structure is allocated on first conversion, and the
   CUSPARSE-specific operations and composed methods are installed. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  Mat              A;
  Mat_MPIAIJ       *mpiaij;
  cusparseStatus_t stat;
  PetscErrorCode   ierr;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat; /* for MAT_INPLACE_MATRIX this is B itself */

  /* vectors created from this matrix default to CUDA */
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  mpiaij = (Mat_MPIAIJ*)A->data;
  if (mpiaij->A)    { ierr = MatSetType(mpiaij->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (mpiaij->B)    { ierr = MatSetType(mpiaij->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }

  if (reuse != MAT_REUSE_MATRIX && !mpiaij->spptr) {
    /* first conversion: create the cuSPARSE side-structure with CSR defaults */
    Mat_MPIAIJCUSPARSE *cusp = new Mat_MPIAIJCUSPARSE;

    cusp->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusp->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusp->coo_p               = NULL;
    cusp->coo_pw              = NULL;
    cusp->stream              = 0;
    cusp->deviceMat           = NULL;
    stat = cusparseCreate(&(cusp->handle));CHKERRCUSPARSE(stat);
    mpiaij->spptr = cusp;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
5769ae82921SPaul Mullowney 
/* Type constructor for MATMPIAIJCUSPARSE: builds a plain MPIAIJ matrix and then
   converts it in place, so all CUSPARSE-specific setup lives in the convert routine. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); /* make sure the CUDA runtime is usable before any device work */
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
5873338378cSStefano Zampini 
5883f9c0db1SPaul Mullowney /*@
5893f9c0db1SPaul Mullowney    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.
5959ae82921SPaul Mullowney 
596d083f849SBarry Smith    Collective
597e057df02SPaul Mullowney 
   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros per row in the diagonal block
         (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row in the off-diagonal block
         (possibly different for each row) or NULL
605e057df02SPaul Mullowney 
606e057df02SPaul Mullowney    Output Parameter:
607e057df02SPaul Mullowney .  A - the matrix
608e057df02SPaul Mullowney 
609e057df02SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
610e057df02SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
611e057df02SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
612e057df02SPaul Mullowney 
613e057df02SPaul Mullowney    Notes:
   If d_nnz (resp. o_nnz) is given then d_nz (resp. o_nz) is ignored
615e057df02SPaul Mullowney 
616e057df02SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
617e057df02SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
618e057df02SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
619e057df02SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
620e057df02SPaul Mullowney 
   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT, o_nz=PETSC_DEFAULT and d_nnz=NULL, o_nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.
625e057df02SPaul Mullowney 
626e057df02SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
627e057df02SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
628e057df02SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
629e057df02SPaul Mullowney    reusing matrix information to achieve increased efficiency.
630e057df02SPaul Mullowney 
631e057df02SPaul Mullowney    Level: intermediate
632e057df02SPaul Mullowney 
633e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
634e057df02SPaul Mullowney @*/
/* Creates an AIJCUSPARSE matrix: SEQAIJCUSPARSE on one process, MPIAIJCUSPARSE
   otherwise, and applies the matching preallocation from d_nz/d_nnz (and
   o_nz/o_nnz in parallel).  See the manual page above for parameter details. */
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscMPIInt    nproc;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&nproc);CHKERRMPI(ierr);
  if (nproc == 1) {
    /* serial: the sequential type only needs the diagonal-block preallocation */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
6539ae82921SPaul Mullowney 
6543ca39a21SBarry Smith /*MC
655e057df02SPaul Mullowney    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
656e057df02SPaul Mullowney 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
6582692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
6592692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
6609ae82921SPaul Mullowney 
6619ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
6629ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
6639ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
6649ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
6659ae82921SPaul Mullowney    the above preallocation routines for simplicity.
6669ae82921SPaul Mullowney 
6679ae82921SPaul Mullowney    Options Database Keys:
668e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
6698468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
6708468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
6718468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
6729ae82921SPaul Mullowney 
6739ae82921SPaul Mullowney   Level: beginner
6749ae82921SPaul Mullowney 
6758468deeeSKarl Rupp  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
6783fa6b06aSMark Adams 
// Get a GPU pointer to a stripped-down split-CSR view of the matrix so values can
// be written from device kernels.  Works for both Seq (size 1) and MPI matrices.
//
// The device structure is created lazily on the first call and cached in the
// cusparse side-structure (cusparsestruct->deviceMat / spptr->deviceMat); the
// corresponding destroy routine frees it.  Requires MAT_CUSPARSE_CSR storage and
// an already-assembled, unfactored matrix; for factored matrices *B is set to NULL.
// On return A->assembled is cleared so a device-side MatSetValues pass can follow.
// NOTE(review): not available with --with-ctable (colmap must be a flat array here).
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;
  PetscScalar                *aa,*ba;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      /* sequential: the matrix itself is the (only) diagonal block */
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      /* parallel: need both the diagonal (A) and off-diagonal (B) blocks */
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    *B = NULL; /* factored matrices are not supported */
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because not called on host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
  }
  if (!*p_d_mat) {
    cudaError_t                 err;
    PetscSplitCSRDataStructure  *d_mat, h_mat;
    Mat_SeqAIJ                  *jaca;
    PetscInt                    n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
      /* Fix: h_mat lives on the stack, so without these the serial path copied
         uninitialized colmap/offdiag.n values to the device (the cudaMemset above
         zeros d_mat but is then fully overwritten by the cudaMemcpy below), and a
         later cudaFree(h_mat.colmap) in a destroy path would act on a garbage
         pointer.  There is no off-process part in serial, so NULL/0 are correct. */
      h_mat.colmap    = NULL;
      h_mat.offdiag.n = 0;
    } else {
      Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ  *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is ussually done (lazy) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9; // sentinel to detect overwrites while filling colmap
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        /* global column -> local off-diagonal column + 1 (0 means "not present") */
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      if (jacb->compressedrow.use) {
        /* device CSR uses compressed rows; upload the full host row offsets instead */
        err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi;
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metdata to device
    err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat; /* already created: return the cached device pointer */
  }
  A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}
814