xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 49b994a94b63d776747762f86beb32ab8d88d03f)
1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
299acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
30fd2b57fSKarl Rupp 
43d13b8fdSMatthew G. Knepley #include <petscconf.h>
59ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
657d48284SJunchao Zhang #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
73d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
87e8381f9SStefano Zampini #include <thrust/advance.h>
94eb5378fSStefano Zampini #include <petscsf.h>
107e8381f9SStefano Zampini 
117e8381f9SStefano Zampini struct VecCUDAEquals
127e8381f9SStefano Zampini {
137e8381f9SStefano Zampini   template <typename Tuple>
147e8381f9SStefano Zampini   __host__ __device__
157e8381f9SStefano Zampini   void operator()(Tuple t)
167e8381f9SStefano Zampini   {
177e8381f9SStefano Zampini     thrust::get<1>(t) = thrust::get<0>(t);
187e8381f9SStefano Zampini   }
197e8381f9SStefano Zampini };
207e8381f9SStefano Zampini 
217e8381f9SStefano Zampini static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
227e8381f9SStefano Zampini {
237e8381f9SStefano Zampini   Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
247e8381f9SStefano Zampini   Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
257e8381f9SStefano Zampini   PetscInt           n = cusp->coo_nd + cusp->coo_no;
267e8381f9SStefano Zampini   PetscErrorCode     ierr;
277e8381f9SStefano Zampini 
287e8381f9SStefano Zampini   PetscFunctionBegin;
29e61fc153SStefano Zampini   if (cusp->coo_p && v) {
3008391a17SStefano Zampini     thrust::device_ptr<const PetscScalar> d_v;
3108391a17SStefano Zampini     THRUSTARRAY                           *w = NULL;
3208391a17SStefano Zampini 
3308391a17SStefano Zampini     if (isCudaMem(v)) {
3408391a17SStefano Zampini       d_v = thrust::device_pointer_cast(v);
3508391a17SStefano Zampini     } else {
36e61fc153SStefano Zampini       w = new THRUSTARRAY(n);
37e61fc153SStefano Zampini       w->assign(v,v+n);
3808391a17SStefano Zampini       ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
3908391a17SStefano Zampini       d_v = w->data();
4008391a17SStefano Zampini     }
4108391a17SStefano Zampini 
4208391a17SStefano Zampini     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
437e8381f9SStefano Zampini                                                               cusp->coo_pw->begin()));
4408391a17SStefano Zampini     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
457e8381f9SStefano Zampini                                                               cusp->coo_pw->end()));
4608391a17SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
477e8381f9SStefano Zampini     thrust::for_each(zibit,zieit,VecCUDAEquals());
4808391a17SStefano Zampini     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
49e61fc153SStefano Zampini     delete w;
507e8381f9SStefano Zampini     ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
517e8381f9SStefano Zampini     ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
527e8381f9SStefano Zampini   } else {
537e8381f9SStefano Zampini     ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
547e8381f9SStefano Zampini     ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
557e8381f9SStefano Zampini   }
567e8381f9SStefano Zampini   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
577e8381f9SStefano Zampini   A->num_ass++;
587e8381f9SStefano Zampini   A->assembled        = PETSC_TRUE;
597e8381f9SStefano Zampini   A->ass_nonzerostate = A->nonzerostate;
604eb5378fSStefano Zampini   A->offloadmask      = PETSC_OFFLOAD_GPU;
617e8381f9SStefano Zampini   PetscFunctionReturn(0);
627e8381f9SStefano Zampini }
637e8381f9SStefano Zampini 
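/*
  A minimal usage sketch (not part of this file) of the COO assembly path that
  dispatches to MatSetValuesCOO_MPIAIJCUSPARSE() above; the sizes, indices, and
  values are hypothetical:

    PetscInt    coo_i[] = {0,0,1};                                  // global row indices
    PetscInt    coo_j[] = {0,2,1};                                  // global column indices
    PetscScalar coo_v[] = {1.0,2.0,3.0};
    ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSetPreallocationCOO(A,3,coo_i,coo_j);CHKERRQ(ierr);   // split into diag/offdiag parts once
    ierr = MatSetValuesCOO(A,coo_v,INSERT_VALUES);CHKERRQ(ierr);    // may be called repeatedly with new values
*/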
647e8381f9SStefano Zampini template <typename Tuple>
657e8381f9SStefano Zampini struct IsNotOffDiagT
667e8381f9SStefano Zampini {
677e8381f9SStefano Zampini   PetscInt _cstart,_cend;
687e8381f9SStefano Zampini 
697e8381f9SStefano Zampini   IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
707e8381f9SStefano Zampini   __host__ __device__
717e8381f9SStefano Zampini   inline bool operator()(Tuple t)
727e8381f9SStefano Zampini   {
737e8381f9SStefano Zampini     return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
747e8381f9SStefano Zampini   }
757e8381f9SStefano Zampini };
767e8381f9SStefano Zampini 
777e8381f9SStefano Zampini struct IsOffDiag
787e8381f9SStefano Zampini {
797e8381f9SStefano Zampini   PetscInt _cstart,_cend;
807e8381f9SStefano Zampini 
817e8381f9SStefano Zampini   IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
827e8381f9SStefano Zampini   __host__ __device__
837e8381f9SStefano Zampini   inline bool operator() (const PetscInt &c)
847e8381f9SStefano Zampini   {
857e8381f9SStefano Zampini     return c < _cstart || c >= _cend;
867e8381f9SStefano Zampini   }
877e8381f9SStefano Zampini };
887e8381f9SStefano Zampini 
897e8381f9SStefano Zampini struct GlobToLoc
907e8381f9SStefano Zampini {
917e8381f9SStefano Zampini   PetscInt _start;
927e8381f9SStefano Zampini 
937e8381f9SStefano Zampini   GlobToLoc(PetscInt start) : _start(start) {}
947e8381f9SStefano Zampini   __host__ __device__
957e8381f9SStefano Zampini   inline PetscInt operator() (const PetscInt &c)
967e8381f9SStefano Zampini   {
977e8381f9SStefano Zampini     return c - _start;
987e8381f9SStefano Zampini   }
997e8381f9SStefano Zampini };
1007e8381f9SStefano Zampini 
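/*
  The three functors above drive the COO preallocation below: IsOffDiag/IsNotOffDiagT
  classify an entry by whether its column lies outside the locally owned range, and
  GlobToLoc shifts global indices to local ones. A standalone sketch of the partition
  step (hypothetical sizes; same idea as the code below):

    THRUSTINTARRAY i(n), j(n), p(n);
    // ... fill i,j with COO indices; p = 0,1,...,n-1 via thrust::sequence ...
    auto first = thrust::make_zip_iterator(thrust::make_tuple(i.begin(),j.begin(),p.begin()));
    auto last  = thrust::make_zip_iterator(thrust::make_tuple(i.end(),j.end(),p.end()));
    auto mid   = thrust::partition(thrust::device,first,last,
                                   IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(cstart,cend));
    // entries before mid belong to the diagonal block; p records the permutation applied
*/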
1017e8381f9SStefano Zampini static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
1027e8381f9SStefano Zampini {
1037e8381f9SStefano Zampini   Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
1047e8381f9SStefano Zampini   Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
1057e8381f9SStefano Zampini   PetscErrorCode         ierr;
1067e8381f9SStefano Zampini   PetscInt               *jj;
1077e8381f9SStefano Zampini   size_t                 noff = 0;
108ddea5d60SJunchao Zhang   THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
1097e8381f9SStefano Zampini   THRUSTINTARRAY         d_j(n);
1107e8381f9SStefano Zampini   ISLocalToGlobalMapping l2g;
1117e8381f9SStefano Zampini   cudaError_t            cerr;
1127e8381f9SStefano Zampini 
1137e8381f9SStefano Zampini   PetscFunctionBegin;
1147e8381f9SStefano Zampini   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
1157e8381f9SStefano Zampini   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
1167e8381f9SStefano Zampini   if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
1177e8381f9SStefano Zampini   if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
1187e8381f9SStefano Zampini   ierr = PetscFree(b->garray);CHKERRQ(ierr);
1197e8381f9SStefano Zampini   ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
1207e8381f9SStefano Zampini   ierr = MatDestroy(&b->A);CHKERRQ(ierr);
1217e8381f9SStefano Zampini   ierr = MatDestroy(&b->B);CHKERRQ(ierr);
1227e8381f9SStefano Zampini 
1237e8381f9SStefano Zampini   ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
1247e8381f9SStefano Zampini   d_i.assign(coo_i,coo_i+n);
1257e8381f9SStefano Zampini   d_j.assign(coo_j,coo_j+n);
1267e8381f9SStefano Zampini   delete cusp->coo_p;
1277e8381f9SStefano Zampini   delete cusp->coo_pw;
1287e8381f9SStefano Zampini   cusp->coo_p = NULL;
1297e8381f9SStefano Zampini   cusp->coo_pw = NULL;
13008391a17SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1317e8381f9SStefano Zampini   auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
1327e8381f9SStefano Zampini   auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
1337e8381f9SStefano Zampini   if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
1347e8381f9SStefano Zampini     cusp->coo_p = new THRUSTINTARRAY(n);
1357e8381f9SStefano Zampini     cusp->coo_pw = new THRUSTARRAY(n);
1367e8381f9SStefano Zampini     thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
1377e8381f9SStefano Zampini     auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
1387e8381f9SStefano Zampini     auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
1397e8381f9SStefano Zampini     auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
1407e8381f9SStefano Zampini     firstoffd = mzipp.get_iterator_tuple().get<1>();
1417e8381f9SStefano Zampini   }
1427e8381f9SStefano Zampini   cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
1437e8381f9SStefano Zampini   cusp->coo_no = thrust::distance(firstoffd,d_j.end());
1447e8381f9SStefano Zampini 
1457e8381f9SStefano Zampini   /* from global to local */
1467e8381f9SStefano Zampini   thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
1477e8381f9SStefano Zampini   thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
14808391a17SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1497e8381f9SStefano Zampini 
1507e8381f9SStefano Zampini   /* copy offdiag column indices to map on the CPU */
151ddea5d60SJunchao Zhang   ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
1527e8381f9SStefano Zampini   cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1537e8381f9SStefano Zampini   auto o_j = d_j.begin();
15408391a17SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
155ddea5d60SJunchao Zhang   thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
1567e8381f9SStefano Zampini   thrust::sort(thrust::device,o_j,d_j.end());
157ddea5d60SJunchao Zhang   auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
15808391a17SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1597e8381f9SStefano Zampini   noff = thrust::distance(o_j,wit);
160ddea5d60SJunchao Zhang   ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
1617e8381f9SStefano Zampini   cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1627e8381f9SStefano Zampini   ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
1637e8381f9SStefano Zampini   ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
1647e8381f9SStefano Zampini   ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
1657e8381f9SStefano Zampini   ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
1667e8381f9SStefano Zampini   if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected IS size %D != %D coo size",n,cusp->coo_no);
1677e8381f9SStefano Zampini   ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);
1687e8381f9SStefano Zampini 
1697e8381f9SStefano Zampini   ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
1707e8381f9SStefano Zampini   ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
1717e8381f9SStefano Zampini   ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1727e8381f9SStefano Zampini   ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
1737e8381f9SStefano Zampini   ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
1747e8381f9SStefano Zampini   ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
1757e8381f9SStefano Zampini   ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1767e8381f9SStefano Zampini   ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
1777e8381f9SStefano Zampini 
1787e8381f9SStefano Zampini   /* GPU memory: the cuSPARSE-specific preallocation routines below manage the device storage internally */
1797e8381f9SStefano Zampini   ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
1807e8381f9SStefano Zampini   ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
1817e8381f9SStefano Zampini   ierr = PetscFree(jj);CHKERRQ(ierr);
1827e8381f9SStefano Zampini 
1837e8381f9SStefano Zampini   ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
1847e8381f9SStefano Zampini   ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
1857e8381f9SStefano Zampini   ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
1867e8381f9SStefano Zampini   ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
187a0e72f99SJunchao Zhang   /*
1887e8381f9SStefano Zampini   ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
1897e8381f9SStefano Zampini   ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
190a0e72f99SJunchao Zhang   */
1917e8381f9SStefano Zampini   ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
1927e8381f9SStefano Zampini   B->preallocated = PETSC_TRUE;
1937e8381f9SStefano Zampini   B->nonzerostate++;
1947e8381f9SStefano Zampini 
1957e8381f9SStefano Zampini   ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
1967e8381f9SStefano Zampini   ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
1977e8381f9SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
1987e8381f9SStefano Zampini   B->assembled = PETSC_FALSE;
1997e8381f9SStefano Zampini   B->was_assembled = PETSC_FALSE;
2007e8381f9SStefano Zampini   PetscFunctionReturn(0);
2017e8381f9SStefano Zampini }
2029ae82921SPaul Mullowney 
203ed502f03SStefano Zampini static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
204ed502f03SStefano Zampini {
205ed502f03SStefano Zampini   Mat            Ad,Ao;
206ed502f03SStefano Zampini   const PetscInt *cmap;
207ed502f03SStefano Zampini   PetscErrorCode ierr;
208ed502f03SStefano Zampini 
209ed502f03SStefano Zampini   PetscFunctionBegin;
210ed502f03SStefano Zampini   ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
211ed502f03SStefano Zampini   ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
212ed502f03SStefano Zampini   if (glob) {
213ed502f03SStefano Zampini     PetscInt cst, i, dn, on, *gidx;
214ed502f03SStefano Zampini 
215ed502f03SStefano Zampini     ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
216ed502f03SStefano Zampini     ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
217ed502f03SStefano Zampini     ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
218ed502f03SStefano Zampini     ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
219ed502f03SStefano Zampini     for (i=0; i<dn; i++) gidx[i]    = cst + i;
220ed502f03SStefano Zampini     for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
221ed502f03SStefano Zampini     ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
222ed502f03SStefano Zampini   }
223ed502f03SStefano Zampini   PetscFunctionReturn(0);
224ed502f03SStefano Zampini }
225ed502f03SStefano Zampini 
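/*
  Usage sketch (not part of this file) for the merge routine above: A_loc is the
  sequential [diag | offdiag] concatenation of the local blocks, and glob maps its
  columns back to global numbering:

    Mat A_loc;
    IS  glob;
    ierr = MatMPIAIJGetLocalMatMerge(A,MAT_INITIAL_MATRIX,&glob,&A_loc);CHKERRQ(ierr);
    // ... operate on the sequential (CUSPARSE) matrix A_loc ...
    ierr = ISDestroy(&glob);CHKERRQ(ierr);
    ierr = MatDestroy(&A_loc);CHKERRQ(ierr);
*/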
2269ae82921SPaul Mullowney PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
2279ae82921SPaul Mullowney {
228bbf3fe20SPaul Mullowney   Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
229bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
2309ae82921SPaul Mullowney   PetscErrorCode     ierr;
2319ae82921SPaul Mullowney   PetscInt           i;
2329ae82921SPaul Mullowney 
2339ae82921SPaul Mullowney   PetscFunctionBegin;
2349ae82921SPaul Mullowney   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
2359ae82921SPaul Mullowney   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
2367e8381f9SStefano Zampini   if (PetscDefined(USE_DEBUG) && d_nnz) {
2379ae82921SPaul Mullowney     for (i=0; i<B->rmap->n; i++) {
2389ae82921SPaul Mullowney       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
2399ae82921SPaul Mullowney     }
2409ae82921SPaul Mullowney   }
2417e8381f9SStefano Zampini   if (PetscDefined(USE_DEBUG) && o_nnz) {
2429ae82921SPaul Mullowney     for (i=0; i<B->rmap->n; i++) {
2439ae82921SPaul Mullowney       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
2449ae82921SPaul Mullowney     }
2459ae82921SPaul Mullowney   }
2466a29ce69SStefano Zampini #if defined(PETSC_USE_CTABLE)
2476a29ce69SStefano Zampini   ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
2486a29ce69SStefano Zampini #else
2496a29ce69SStefano Zampini   ierr = PetscFree(b->colmap);CHKERRQ(ierr);
2506a29ce69SStefano Zampini #endif
2516a29ce69SStefano Zampini   ierr = PetscFree(b->garray);CHKERRQ(ierr);
2526a29ce69SStefano Zampini   ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
2536a29ce69SStefano Zampini   ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
2546a29ce69SStefano Zampini   /* Because the B will have been resized we simply destroy it and create a new one each time */
2556a29ce69SStefano Zampini   ierr = MatDestroy(&b->B);CHKERRQ(ierr);
2566a29ce69SStefano Zampini   if (!b->A) {
2579ae82921SPaul Mullowney     ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
2589ae82921SPaul Mullowney     ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
2593bb1ff40SBarry Smith     ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
2606a29ce69SStefano Zampini   }
2616a29ce69SStefano Zampini   if (!b->B) {
2626a29ce69SStefano Zampini     PetscMPIInt size;
26355b25c41SPierre Jolivet     ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
2649ae82921SPaul Mullowney     ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
2656a29ce69SStefano Zampini     ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
2663bb1ff40SBarry Smith     ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
2679ae82921SPaul Mullowney   }
268d98d7c49SStefano Zampini   ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
269d98d7c49SStefano Zampini   ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
2706a29ce69SStefano Zampini   ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
2716a29ce69SStefano Zampini   ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
2729ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
2739ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
274e057df02SPaul Mullowney   ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
275e057df02SPaul Mullowney   ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
276b06137fdSPaul Mullowney   ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
277b06137fdSPaul Mullowney   ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
278a0e72f99SJunchao Zhang   /* Let A, B use b's handle with pre-set stream
279b06137fdSPaul Mullowney   ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
280b06137fdSPaul Mullowney   ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
281a0e72f99SJunchao Zhang   */
2829ae82921SPaul Mullowney   B->preallocated = PETSC_TRUE;
2839ae82921SPaul Mullowney   PetscFunctionReturn(0);
2849ae82921SPaul Mullowney }
285e057df02SPaul Mullowney 
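/*
  Typical call sequence (hypothetical local sizes m,n) reaching the preallocation
  routine above; d_nz/d_nnz describe the diagonal block (columns owned by this rank),
  o_nz/o_nnz the off-diagonal block:

    ierr = MatCreate(PETSC_COMM_WORLD,&B);CHKERRQ(ierr);
    ierr = MatSetSizes(B,m,n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
    ierr = MatSetType(B,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(B,5,NULL,2,NULL);CHKERRQ(ierr); // at most 5 diag and 2 offdiag nonzeros per row
*/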
2869ae82921SPaul Mullowney PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
2879ae82921SPaul Mullowney {
2889ae82921SPaul Mullowney   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
2899ae82921SPaul Mullowney   PetscErrorCode ierr;
2909ae82921SPaul Mullowney   PetscInt       nt;
2919ae82921SPaul Mullowney 
2929ae82921SPaul Mullowney   PetscFunctionBegin;
2939ae82921SPaul Mullowney   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
2949ae82921SPaul Mullowney   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
29556258e06SRichard Tran Mills   /* If A is bound to the CPU, the local vector used in the matrix multiplies should also be bound to the CPU. */
29656258e06SRichard Tran Mills   if (A->boundtocpu) {ierr = VecBindToCPU(a->lvec,PETSC_TRUE);CHKERRQ(ierr);}
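  /* overlap communication and computation: start the ghost-value scatter, apply the diagonal block, finish the scatter, then add the off-diagonal contribution */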
2979ae82921SPaul Mullowney   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
2984d55d066SJunchao Zhang   ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
2999ae82921SPaul Mullowney   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
3009ae82921SPaul Mullowney   ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
3019ae82921SPaul Mullowney   PetscFunctionReturn(0);
3029ae82921SPaul Mullowney }
303ca45077fSPaul Mullowney 
3043fa6b06aSMark Adams PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
3053fa6b06aSMark Adams {
3063fa6b06aSMark Adams   Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
3073fa6b06aSMark Adams   PetscErrorCode ierr;
3083fa6b06aSMark Adams 
3093fa6b06aSMark Adams   PetscFunctionBegin;
3103fa6b06aSMark Adams   ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
3113fa6b06aSMark Adams   ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
3123fa6b06aSMark Adams   PetscFunctionReturn(0);
3133fa6b06aSMark Adams }
314042217e8SBarry Smith 
315fdc842d1SBarry Smith PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
316fdc842d1SBarry Smith {
317fdc842d1SBarry Smith   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
318fdc842d1SBarry Smith   PetscErrorCode ierr;
319fdc842d1SBarry Smith   PetscInt       nt;
320fdc842d1SBarry Smith 
321fdc842d1SBarry Smith   PetscFunctionBegin;
322fdc842d1SBarry Smith   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
323fdc842d1SBarry Smith   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
324fdc842d1SBarry Smith   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
3254d55d066SJunchao Zhang   ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
326fdc842d1SBarry Smith   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
327fdc842d1SBarry Smith   ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
328fdc842d1SBarry Smith   PetscFunctionReturn(0);
329fdc842d1SBarry Smith }
330fdc842d1SBarry Smith 
331ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
332ca45077fSPaul Mullowney {
333ca45077fSPaul Mullowney   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
334ca45077fSPaul Mullowney   PetscErrorCode ierr;
335ca45077fSPaul Mullowney   PetscInt       nt;
336ca45077fSPaul Mullowney 
337ca45077fSPaul Mullowney   PetscFunctionBegin;
338ca45077fSPaul Mullowney   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
339ccf5f80bSJunchao Zhang   if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
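  /* the off-diagonal block contributes to rows owned by other ranks: compute B^T xx into lvec and A^T xx into yy, then scatter-add lvec back into yy */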
3409b2db222SKarl Rupp   ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
341ca45077fSPaul Mullowney   ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
3429b2db222SKarl Rupp   ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
3439b2db222SKarl Rupp   ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
344ca45077fSPaul Mullowney   PetscFunctionReturn(0);
345ca45077fSPaul Mullowney }
3469ae82921SPaul Mullowney 
347e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
348ca45077fSPaul Mullowney {
349ca45077fSPaul Mullowney   Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
350bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
351e057df02SPaul Mullowney 
352ca45077fSPaul Mullowney   PetscFunctionBegin;
353e057df02SPaul Mullowney   switch (op) {
354e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT_DIAG:
355e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat = format;
356045c96e1SPaul Mullowney     break;
357e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT_OFFDIAG:
358e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = format;
359045c96e1SPaul Mullowney     break;
360e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
361e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat    = format;
362e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = format;
363045c96e1SPaul Mullowney     break;
364e057df02SPaul Mullowney   default:
365e057df02SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
366045c96e1SPaul Mullowney   }
367ca45077fSPaul Mullowney   PetscFunctionReturn(0);
368ca45077fSPaul Mullowney }
369e057df02SPaul Mullowney 
3704416b707SBarry Smith PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
371e057df02SPaul Mullowney {
372e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
373e057df02SPaul Mullowney   PetscErrorCode           ierr;
374e057df02SPaul Mullowney   PetscBool                flg;
375a183c035SDominic Meiser   Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
376a183c035SDominic Meiser   Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
3775fd66863SKarl Rupp 
378e057df02SPaul Mullowney   PetscFunctionBegin;
379e55864a3SBarry Smith   ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
380e057df02SPaul Mullowney   if (A->factortype==MAT_FACTOR_NONE) {
381e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
382a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
383e057df02SPaul Mullowney     if (flg) {
384e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
385e057df02SPaul Mullowney     }
386e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
387a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
388e057df02SPaul Mullowney     if (flg) {
389e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
390e057df02SPaul Mullowney     }
391e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
392a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
393e057df02SPaul Mullowney     if (flg) {
394e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
395e057df02SPaul Mullowney     }
396e057df02SPaul Mullowney   }
3970af67c1bSStefano Zampini   ierr = PetscOptionsTail();CHKERRQ(ierr);
398e057df02SPaul Mullowney   PetscFunctionReturn(0);
399e057df02SPaul Mullowney }
400e057df02SPaul Mullowney 
40134d6c7a5SJose E. Roman PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
40234d6c7a5SJose E. Roman {
40334d6c7a5SJose E. Roman   PetscErrorCode     ierr;
4043fa6b06aSMark Adams   Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
405042217e8SBarry Smith   Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
406042217e8SBarry Smith   PetscObjectState   onnz = A->nonzerostate;
407042217e8SBarry Smith 
40834d6c7a5SJose E. Roman   PetscFunctionBegin;
40934d6c7a5SJose E. Roman   ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
410042217e8SBarry Smith   if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
411042217e8SBarry Smith   if (onnz != A->nonzerostate && cusp->deviceMat) {
412042217e8SBarry Smith     PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
413042217e8SBarry Smith     cudaError_t                cerr;
414042217e8SBarry Smith 
415042217e8SBarry Smith     ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
416042217e8SBarry Smith     ierr = PetscNew(&h_mat);CHKERRQ(ierr);
417042217e8SBarry Smith     cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
418042217e8SBarry Smith     cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
41999551766SMark Adams     if (h_mat->allocated_indices) {
42099551766SMark Adams       cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
42199551766SMark Adams       cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
42299551766SMark Adams       if (h_mat->offdiag.j) {
42399551766SMark Adams         cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
42499551766SMark Adams         cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
42599551766SMark Adams       }
42699551766SMark Adams     }
427042217e8SBarry Smith     cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
428042217e8SBarry Smith     ierr = PetscFree(h_mat);CHKERRQ(ierr);
429042217e8SBarry Smith     cusp->deviceMat = NULL;
4303fa6b06aSMark Adams   }
43134d6c7a5SJose E. Roman   PetscFunctionReturn(0);
43234d6c7a5SJose E. Roman }
43334d6c7a5SJose E. Roman 
434bbf3fe20SPaul Mullowney PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
435bbf3fe20SPaul Mullowney {
436bbf3fe20SPaul Mullowney   PetscErrorCode     ierr;
4373fa6b06aSMark Adams   Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
4383fa6b06aSMark Adams   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
439b06137fdSPaul Mullowney   cusparseStatus_t   stat;
440bbf3fe20SPaul Mullowney 
441bbf3fe20SPaul Mullowney   PetscFunctionBegin;
442d98d7c49SStefano Zampini   if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
4433fa6b06aSMark Adams   if (cusparseStruct->deviceMat) {
444042217e8SBarry Smith     PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;
445042217e8SBarry Smith     cudaError_t                cerr;
446042217e8SBarry Smith 
4473fa6b06aSMark Adams     ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
448042217e8SBarry Smith     ierr = PetscNew(&h_mat);CHKERRQ(ierr);
449042217e8SBarry Smith     cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
450042217e8SBarry Smith     cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
45199551766SMark Adams     if (h_mat->allocated_indices) {
45299551766SMark Adams       cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
45399551766SMark Adams       cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
45499551766SMark Adams       if (h_mat->offdiag.j) {
45599551766SMark Adams         cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
45699551766SMark Adams         cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
45799551766SMark Adams       }
45899551766SMark Adams     }
459042217e8SBarry Smith     cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
460042217e8SBarry Smith     ierr = PetscFree(h_mat);CHKERRQ(ierr);
4613fa6b06aSMark Adams   }
462bbf3fe20SPaul Mullowney   try {
4633fa6b06aSMark Adams     if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
4643fa6b06aSMark Adams     if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
46557d48284SJunchao Zhang     stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
466a0e72f99SJunchao Zhang     /* We want cusparseStruct to use PetscDefaultCudaStream
46717403302SKarl Rupp     if (cusparseStruct->stream) {
468042217e8SBarry Smith       cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
46917403302SKarl Rupp     }
470a0e72f99SJunchao Zhang     */
4717e8381f9SStefano Zampini     delete cusparseStruct->coo_p;
4727e8381f9SStefano Zampini     delete cusparseStruct->coo_pw;
473bbf3fe20SPaul Mullowney     delete cusparseStruct;
474bbf3fe20SPaul Mullowney   } catch(char *ex) {
475bbf3fe20SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
476bbf3fe20SPaul Mullowney   }
4773338378cSStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
478ed502f03SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
4797e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
4807e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
4813338378cSStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
482ae48a8d0SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
483bbf3fe20SPaul Mullowney   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
484bbf3fe20SPaul Mullowney   PetscFunctionReturn(0);
485bbf3fe20SPaul Mullowney }
486ca45077fSPaul Mullowney 
4873338378cSStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
4889ae82921SPaul Mullowney {
4899ae82921SPaul Mullowney   PetscErrorCode     ierr;
490bbf3fe20SPaul Mullowney   Mat_MPIAIJ         *a;
491bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE *cusparseStruct;
492b06137fdSPaul Mullowney   cusparseStatus_t   stat;
4933338378cSStefano Zampini   Mat                A;
4949ae82921SPaul Mullowney 
4959ae82921SPaul Mullowney   PetscFunctionBegin;
4963338378cSStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
4973338378cSStefano Zampini     ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
4983338378cSStefano Zampini   } else if (reuse == MAT_REUSE_MATRIX) {
4993338378cSStefano Zampini     ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
5003338378cSStefano Zampini   }
5013338378cSStefano Zampini   A = *newmat;
5026f3d89d0SStefano Zampini   A->boundtocpu = PETSC_FALSE;
50334136279SStefano Zampini   ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
50434136279SStefano Zampini   ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);
50534136279SStefano Zampini 
506bbf3fe20SPaul Mullowney   a = (Mat_MPIAIJ*)A->data;
507d98d7c49SStefano Zampini   if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
508d98d7c49SStefano Zampini   if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
509d98d7c49SStefano Zampini   if (a->lvec) {
510d98d7c49SStefano Zampini     ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
511d98d7c49SStefano Zampini   }
512d98d7c49SStefano Zampini 
5133338378cSStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
514bbf3fe20SPaul Mullowney     a->spptr = new Mat_MPIAIJCUSPARSE;
5152205254eSKarl Rupp 
516bbf3fe20SPaul Mullowney     cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
517e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
518e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
5197e8381f9SStefano Zampini     cusparseStruct->coo_p               = NULL;
5207e8381f9SStefano Zampini     cusparseStruct->coo_pw              = NULL;
521042217e8SBarry Smith     cusparseStruct->stream              = 0;
5223fa6b06aSMark Adams     cusparseStruct->deviceMat           = NULL;
523042217e8SBarry Smith     stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
5243338378cSStefano Zampini   }
5252205254eSKarl Rupp 
52634d6c7a5SJose E. Roman   A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
527bbf3fe20SPaul Mullowney   A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
528fdc842d1SBarry Smith   A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
529bbf3fe20SPaul Mullowney   A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
530bbf3fe20SPaul Mullowney   A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
531bbf3fe20SPaul Mullowney   A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
5323fa6b06aSMark Adams   A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
5334e84afc0SStefano Zampini   A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
5342205254eSKarl Rupp 
535bbf3fe20SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
536ed502f03SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
5373338378cSStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
538bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
5397e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
5407e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
541ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE)
542ae48a8d0SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
543ae48a8d0SStefano Zampini #endif
5449ae82921SPaul Mullowney   PetscFunctionReturn(0);
5459ae82921SPaul Mullowney }
5469ae82921SPaul Mullowney 
5473338378cSStefano Zampini PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
5483338378cSStefano Zampini {
5493338378cSStefano Zampini   PetscErrorCode ierr;
5503338378cSStefano Zampini 
5513338378cSStefano Zampini   PetscFunctionBegin;
552a4af0ceeSJacob Faibussowitsch   ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
5533338378cSStefano Zampini   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
5543338378cSStefano Zampini   ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
5553338378cSStefano Zampini   PetscFunctionReturn(0);
5563338378cSStefano Zampini }
5573338378cSStefano Zampini 
5583f9c0db1SPaul Mullowney /*@
5593f9c0db1SPaul Mullowney    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
560e057df02SPaul Mullowney    (the default parallel PETSc format).  This matrix will ultimately be pushed down
5613f9c0db1SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
562e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
563e057df02SPaul Mullowney    the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these parameters accurately,
564e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
5659ae82921SPaul Mullowney 
566d083f849SBarry Smith    Collective
567e057df02SPaul Mullowney 
568e057df02SPaul Mullowney    Input Parameters:
569e057df02SPaul Mullowney +  comm - MPI communicator
570e057df02SPaul Mullowney .  m - number of local rows (or PETSC_DECIDE)
571e057df02SPaul Mullowney .  n - number of local columns (or PETSC_DECIDE)
.  M - number of global rows (or PETSC_DETERMINE)
.  N - number of global columns (or PETSC_DETERMINE)
572e057df02SPaul Mullowney .  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
573e057df02SPaul Mullowney .  d_nnz - array containing the number of nonzeros per row in the diagonal block
5740298fd71SBarry Smith          (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row in the off-diagonal block
         (possibly different for each row) or NULL
575e057df02SPaul Mullowney 
576e057df02SPaul Mullowney    Output Parameter:
577e057df02SPaul Mullowney .  A - the matrix
578e057df02SPaul Mullowney 
579e057df02SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
580e057df02SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
581e057df02SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
582e057df02SPaul Mullowney 
583e057df02SPaul Mullowney    Notes:
584e057df02SPaul Mullowney    If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
585e057df02SPaul Mullowney 
586e057df02SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
587e057df02SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
588e057df02SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
589e057df02SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
590e057df02SPaul Mullowney 
591e057df02SPaul Mullowney    Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
5920298fd71SBarry Smith    Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
593e057df02SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
594e057df02SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
595e057df02SPaul Mullowney 
596e057df02SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
597e057df02SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
598e057df02SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
599e057df02SPaul Mullowney    reusing matrix information to achieve increased efficiency.
600e057df02SPaul Mullowney 
601e057df02SPaul Mullowney    Level: intermediate
602e057df02SPaul Mullowney 
603e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
604e057df02SPaul Mullowney @*/
6059ae82921SPaul Mullowney PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
6069ae82921SPaul Mullowney {
6079ae82921SPaul Mullowney   PetscErrorCode ierr;
6089ae82921SPaul Mullowney   PetscMPIInt    size;
6099ae82921SPaul Mullowney 
6109ae82921SPaul Mullowney   PetscFunctionBegin;
6119ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
6129ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
613ffc4695bSBarry Smith   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
6149ae82921SPaul Mullowney   if (size > 1) {
6159ae82921SPaul Mullowney     ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
6169ae82921SPaul Mullowney     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
6179ae82921SPaul Mullowney   } else {
6189ae82921SPaul Mullowney     ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
6199ae82921SPaul Mullowney     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
6209ae82921SPaul Mullowney   }
6219ae82921SPaul Mullowney   PetscFunctionReturn(0);
6229ae82921SPaul Mullowney }
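/*
  Example call (hypothetical local sizes m,n): create a distributed AIJCUSPARSE matrix
  with at most 5 nonzeros per row in the diagonal block and 2 in the off-diagonal block:

    Mat A;
    ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,m,n,PETSC_DETERMINE,PETSC_DETERMINE,5,NULL,2,NULL,&A);CHKERRQ(ierr);
*/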
6239ae82921SPaul Mullowney 
6243ca39a21SBarry Smith /*MC
6256bb69460SJunchao Zhang    MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.
626e057df02SPaul Mullowney 
6272692e278SPaul Mullowney    A matrix type whose data resides on Nvidia GPUs. These matrices can be stored in
6282692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
6292692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
6309ae82921SPaul Mullowney 
6319ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
6329ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
6339ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
6349ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
6359ae82921SPaul Mullowney    the above preallocation routines for simplicity.
6369ae82921SPaul Mullowney 
6379ae82921SPaul Mullowney    Options Database Keys:
638e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
6398468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
6408468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
6418468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
6429ae82921SPaul Mullowney 
6439ae82921SPaul Mullowney   Level: beginner
6449ae82921SPaul Mullowney 
6456bb69460SJunchao Zhang  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
6466bb69460SJunchao Zhang M*/
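/*
   An example invocation (hypothetical executable name) selecting this type and a
   storage format from the options database keys listed above:

     mpiexec -n 4 ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format ell
*/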
6476bb69460SJunchao Zhang 
6486bb69460SJunchao Zhang /*MC
6496bb69460SJunchao Zhang    MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.
6506bb69460SJunchao Zhang 
6516bb69460SJunchao Zhang   Level: beginner
6526bb69460SJunchao Zhang 
6536bb69460SJunchao Zhang  .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
6549ae82921SPaul Mullowney M*/
6553fa6b06aSMark Adams 
656042217e8SBarry Smith PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
657042217e8SBarry Smith 
658042217e8SBarry Smith // get GPU pointers to a stripped-down Mat; works for both Seq and MPI Mat
659042217e8SBarry Smith PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
6603fa6b06aSMark Adams {
661042217e8SBarry Smith   PetscSplitCSRDataStructure d_mat;
662042217e8SBarry Smith   PetscMPIInt                size;
6633fa6b06aSMark Adams   PetscErrorCode             ierr;
664042217e8SBarry Smith   int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
665042217e8SBarry Smith   PetscScalar                *aa = NULL,*ba = NULL;
66699551766SMark Adams   Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
667042217e8SBarry Smith   Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
668042217e8SBarry Smith   CsrMatrix                  *matrixA = NULL,*matrixB = NULL;
6693fa6b06aSMark Adams 
6703fa6b06aSMark Adams   PetscFunctionBegin;
671042217e8SBarry Smith   if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
672042217e8SBarry Smith   if (A->factortype != MAT_FACTOR_NONE) {
6733fa6b06aSMark Adams     *B = NULL;
6743fa6b06aSMark Adams     PetscFunctionReturn(0);
6753fa6b06aSMark Adams   }
676042217e8SBarry Smith   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
67799551766SMark Adams   // get jaca
6783fa6b06aSMark Adams   if (size == 1) {
679042217e8SBarry Smith     PetscBool isseqaij;
680042217e8SBarry Smith 
681042217e8SBarry Smith     ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
682042217e8SBarry Smith     if (isseqaij) {
6833fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)A->data;
684042217e8SBarry Smith       if (!jaca->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
685042217e8SBarry Smith       cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
686042217e8SBarry Smith       d_mat = cusparsestructA->deviceMat;
687042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
6883fa6b06aSMark Adams     } else {
6893fa6b06aSMark Adams       Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
690042217e8SBarry Smith       if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
691042217e8SBarry Smith       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
6923fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)aij->A->data;
693042217e8SBarry Smith       cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
694042217e8SBarry Smith       d_mat = spptr->deviceMat;
695042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
6963fa6b06aSMark Adams     }
697042217e8SBarry Smith     if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
698042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
699042217e8SBarry Smith       if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
700042217e8SBarry Smith       matrixA = (CsrMatrix*)matstruct->mat;
701042217e8SBarry Smith       bi = NULL;
702042217e8SBarry Smith       bj = NULL;
703042217e8SBarry Smith       ba = NULL;
704042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
705042217e8SBarry Smith   } else {
706042217e8SBarry Smith     Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
707042217e8SBarry Smith     if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
708042217e8SBarry Smith     jaca = (Mat_SeqAIJ*)aij->A->data;
70999551766SMark Adams     jacb = (Mat_SeqAIJ*)aij->B->data;
710042217e8SBarry Smith     Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
7113fa6b06aSMark Adams 
712042217e8SBarry Smith     if (!A->nooffprocentries && !aij->donotstash) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
713042217e8SBarry Smith     cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
714042217e8SBarry Smith     Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
715042217e8SBarry Smith     if (cusparsestructA->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
716042217e8SBarry Smith     if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
717042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
718042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
719042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
720042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
721042217e8SBarry Smith       if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
722042217e8SBarry Smith       if (!matstructB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
723042217e8SBarry Smith       matrixA = (CsrMatrix*)matstructA->mat;
724042217e8SBarry Smith       matrixB = (CsrMatrix*)matstructB->mat;
7253fa6b06aSMark Adams       if (jacb->compressedrow.use) {
726042217e8SBarry Smith         if (!cusparsestructB->rowoffsets_gpu) {
727042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
728042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
7293fa6b06aSMark Adams         }
730042217e8SBarry Smith         bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
731042217e8SBarry Smith       } else {
732042217e8SBarry Smith         bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
733042217e8SBarry Smith       }
734042217e8SBarry Smith       bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
735042217e8SBarry Smith       ba = thrust::raw_pointer_cast(matrixB->values->data());
736042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
737042217e8SBarry Smith     d_mat = spptr->deviceMat;
738042217e8SBarry Smith   }
7393fa6b06aSMark Adams   if (jaca->compressedrow.use) {
740042217e8SBarry Smith     if (!cusparsestructA->rowoffsets_gpu) {
741042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
742042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
7433fa6b06aSMark Adams     }
744042217e8SBarry Smith     ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
7453fa6b06aSMark Adams   } else {
746042217e8SBarry Smith     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
7473fa6b06aSMark Adams   }
748042217e8SBarry Smith   aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
749042217e8SBarry Smith   aa = thrust::raw_pointer_cast(matrixA->values->data());
750042217e8SBarry Smith 
751042217e8SBarry Smith   if (!d_mat) {
752042217e8SBarry Smith     cudaError_t                cerr;
753042217e8SBarry Smith     PetscSplitCSRDataStructure h_mat;
754042217e8SBarry Smith 
755042217e8SBarry Smith     // create and populate the struct on the host, then copy it to the device
756042217e8SBarry Smith     ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
757042217e8SBarry Smith     ierr = PetscNew(&h_mat);CHKERRQ(ierr);
758042217e8SBarry Smith     cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
759042217e8SBarry Smith     if (size > 1) { /* need the colmap array */
760042217e8SBarry Smith       Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
76199551766SMark Adams       PetscInt   *colmap;
762042217e8SBarry Smith       PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;
763042217e8SBarry Smith 
764b3c64f9dSJunchao Zhang       if (n && !aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
765042217e8SBarry Smith 
766042217e8SBarry Smith       ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
767042217e8SBarry Smith       for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
768365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
769365b711fSMark Adams       { // have to make 64-bit copies of these 32-bit device index arrays
77099551766SMark Adams         int        *h_bi32, *h_bj32;
77199551766SMark Adams         PetscInt   *h_bi64, *h_bj64, *d_bi64, *d_bj64;
772365b711fSMark Adams         ierr = PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);CHKERRQ(ierr);
77399551766SMark Adams         cerr = cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
77499551766SMark Adams         for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
77599551766SMark Adams         cerr = cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
77699551766SMark Adams         for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];
77799551766SMark Adams 
77899551766SMark Adams         cerr = cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));CHKERRCUDA(cerr);
77999551766SMark Adams         cerr = cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
78099551766SMark Adams         cerr = cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));CHKERRCUDA(cerr);
78199551766SMark Adams         cerr = cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
78299551766SMark Adams 
78399551766SMark Adams         h_mat->offdiag.i = d_bi64;
78499551766SMark Adams         h_mat->offdiag.j = d_bj64;
78599551766SMark Adams         h_mat->allocated_indices = PETSC_TRUE;
78699551766SMark Adams 
787365b711fSMark Adams         ierr = PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);CHKERRQ(ierr);
788365b711fSMark Adams       }
789365b711fSMark Adams #else
79099551766SMark Adams       h_mat->offdiag.i = (PetscInt*)bi;
79199551766SMark Adams       h_mat->offdiag.j = (PetscInt*)bj;
79299551766SMark Adams       h_mat->allocated_indices = PETSC_FALSE;
793365b711fSMark Adams #endif
794042217e8SBarry Smith       h_mat->offdiag.a = ba;
795042217e8SBarry Smith       h_mat->offdiag.n = A->rmap->n;
796042217e8SBarry Smith 
79799551766SMark Adams       cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));CHKERRCUDA(cerr);
79899551766SMark Adams       cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
799042217e8SBarry Smith       ierr = PetscFree(colmap);CHKERRQ(ierr);
800042217e8SBarry Smith     }
801042217e8SBarry Smith     h_mat->rstart = A->rmap->rstart;
802042217e8SBarry Smith     h_mat->rend   = A->rmap->rend;
803042217e8SBarry Smith     h_mat->cstart = A->cmap->rstart;
804042217e8SBarry Smith     h_mat->cend   = A->cmap->rend;
805*49b994a9SMark Adams     h_mat->M      = A->cmap->N;
806365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
807365b711fSMark Adams     {
80899551766SMark Adams       if (sizeof(PetscInt) != 8) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"size of PetscInt = %d",(int)sizeof(PetscInt));
80999551766SMark Adams       int        *h_ai32, *h_aj32;
81099551766SMark Adams       PetscInt   *h_ai64, *h_aj64, *d_ai64, *d_aj64;
811365b711fSMark Adams       ierr = PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);CHKERRQ(ierr);
81299551766SMark Adams       cerr = cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
81399551766SMark Adams       for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
81499551766SMark Adams       cerr = cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
81599551766SMark Adams       for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];
81699551766SMark Adams 
81799551766SMark Adams       cerr = cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));CHKERRCUDA(cerr);
81899551766SMark Adams       cerr = cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
81999551766SMark Adams       cerr = cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));CHKERRCUDA(cerr);
82099551766SMark Adams       cerr = cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
82199551766SMark Adams 
82299551766SMark Adams       h_mat->diag.i = d_ai64;
82399551766SMark Adams       h_mat->diag.j = d_aj64;
82499551766SMark Adams       h_mat->allocated_indices = PETSC_TRUE;
82599551766SMark Adams 
826365b711fSMark Adams       ierr = PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);CHKERRQ(ierr);
827365b711fSMark Adams     }
828365b711fSMark Adams #else
82999551766SMark Adams     h_mat->diag.i = (PetscInt*)ai;
83099551766SMark Adams     h_mat->diag.j = (PetscInt*)aj;
83199551766SMark Adams     h_mat->allocated_indices = PETSC_FALSE;
832365b711fSMark Adams #endif
833042217e8SBarry Smith     h_mat->diag.a = aa;
834042217e8SBarry Smith     h_mat->diag.n = A->rmap->n;
835042217e8SBarry Smith     h_mat->rank   = PetscGlobalRank;
836042217e8SBarry Smith     // copy pointers and metadata to device
837042217e8SBarry Smith     cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
838042217e8SBarry Smith     ierr = PetscFree(h_mat);CHKERRQ(ierr);
839042217e8SBarry Smith   } else {
840042217e8SBarry Smith     ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
841042217e8SBarry Smith   }
842042217e8SBarry Smith   *B = d_mat;
843042217e8SBarry Smith   A->offloadmask = PETSC_OFFLOAD_GPU;
8443fa6b06aSMark Adams   PetscFunctionReturn(0);
8453fa6b06aSMark Adams }
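/*
  A hedged usage sketch (kernel name and launch parameters are hypothetical): the
  returned device struct is intended for GPU-side assembly, where a user CUDA kernel
  inserts values through the device API declared in petscaijdevice.h:

    PetscSplitCSRDataStructure d_mat;
    ierr = MatCUSPARSEGetDeviceMatWrite(A,&d_mat);CHKERRQ(ierr);
    assemble_kernel<<<nblocks,nthreads>>>(d_mat,...);  // kernel calls MatSetValuesDevice(d_mat,...)
    ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
    ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
*/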
846