#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <petscsf.h>

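/* Thrust functor used with zipped iterators: copies the first tuple entry into
   the second. Below it scatters (permuted) user-provided COO values into the
   internal device work array. */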
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

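/* Set the values of a matrix previously preallocated through the COO interface.
   If the entries had to be reordered into [diagonal block | off-diagonal block]
   order at preallocation time (cusp->coo_p is set), the incoming values are
   first permuted into the device work array cusp->coo_pw; otherwise they are
   forwarded as-is to the two sequential blocks. */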
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

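/* Device predicates for splitting COO entries: IsNotOffDiagT accepts (i,j,perm)
   tuples whose column index falls inside the local diagonal block [cstart,cend),
   IsOffDiag is the column-only complement, and GlobToLoc shifts a global index
   down to a local one. */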
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

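/* Preallocate from a list of global (i,j) pairs: partition the entries on the
   device into diagonal-block and off-diagonal-block groups (recording the
   permutation in cusp->coo_p when the input interleaves them), convert the
   indices from global to local numbering, compact the off-diagonal column
   indices into b->garray, and preallocate the two SEQAIJCUSPARSE blocks. */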
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n);
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected IS size %D != %D (COO size)",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

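/* Example (a hedged sketch, not from the PETSc test suite): the two routines
   above back the public MatSetPreallocationCOO()/MatSetValuesCOO() interface
   through the function composition done in MatConvert below. The array names
   ncoo, coo_i, coo_j, and coo_v are hypothetical.

     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,m,n,PETSC_DECIDE,PETSC_DECIDE);CHKERRQ(ierr);
     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatSetPreallocationCOO(A,ncoo,coo_i,coo_j);CHKERRQ(ierr);  // global indices
     ierr = MatSetValuesCOO(A,coo_v,INSERT_VALUES);CHKERRQ(ierr);      // host or device values
*/

/* Return the local part of the matrix as a single sequential matrix obtained
   by merging the diagonal and off-diagonal blocks on the device; optionally
   also return the global column indices of the merged matrix in glob. */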
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

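/* Classic (non-COO) preallocation: (re)create the diagonal block b->A and the
   off-diagonal block b->B as SEQAIJCUSPARSE matrices sharing one cuSPARSE
   handle, then forward the d_nnz/o_nnz information to MatSeqAIJSetPreallocation(). */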
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the off-diagonal block b->B will have been resized, we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/*@
   MatAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Not Collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE or MPIAIJCUSPARSE
-  gen - the boolean flag

   Level: intermediate

.seealso: MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE
@*/
PetscErrorCode  MatAIJCUSPARSESetGenerateTranspose(Mat A, PetscBool gen)
{
  PetscErrorCode ierr;
  PetscBool      ismpiaij;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  MatCheckPreallocated(A,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATMPIAIJ,&ismpiaij);CHKERRQ(ierr);
  if (ismpiaij) {
    Mat A_d,A_o;

    ierr = MatMPIAIJGetSeqAIJ(A,&A_d,&A_o,NULL);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A_d,gen);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A_o,gen);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,gen);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
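
/* Example (a hedged sketch): when a solver applies the transpose of the same
   matrix many times, generating the transpose once up front can pay off; the
   ksp, b, and x objects here are illustrative only.

     ierr = MatAIJCUSPARSESetGenerateTranspose(A,PETSC_TRUE);CHKERRQ(ierr);
     ierr = KSPSolveTranspose(ksp,b,x);CHKERRQ(ierr);
*/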

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

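/* Zero all entries; when a fused device matrix (spptr->deviceMat) exists, its
   diagonal and off-diagonal value arrays are cleared directly with cudaMemset()
   before the host-side blocks are zeroed. */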
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t  err;
      PetscScalar  *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

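/* y = A^T x: compute a->lvec = B^T x and yy = (diagonal block)^T x locally,
   then reverse-scatter lvec into yy with ADD_VALUES to accumulate the
   off-process contributions. */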
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

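/* After the generic MPIAIJ assembly, retype the newly created local work vector
   to VECSEQCUDA (optionally NVSHMEM-backed when -use_nvshmem is set), and mark
   the matrix as offloaded to the GPU if assembly happened through a device matrix. */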
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
   #if defined(PETSC_HAVE_NVSHMEM)
    {
      PetscMPIInt result;
      PetscBool   useNvshmem = PETSC_FALSE;
      ierr = PetscOptionsGetBool(NULL,NULL,"-use_nvshmem",&useNvshmem,NULL);CHKERRQ(ierr);
      if (useNvshmem) {
        ierr = MPI_Comm_compare(PETSC_COMM_WORLD,PetscObjectComm((PetscObject)A),&result);CHKERRMPI(ierr);
        if (result == MPI_IDENT || result == MPI_CONGRUENT) {ierr = VecAllocateNVSHMEM_SeqCUDA(mpiaij->lvec);CHKERRQ(ierr);}
      }
    }
   #endif
  }
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

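/* Convert an MPIAIJ matrix to MPIAIJCUSPARSE: switch the blocks to
   SEQAIJCUSPARSE and the work vector to VECSEQCUDA, create the shared cuSPARSE
   handle on PETSc's default stream, and install the CUDA-specific operations
   and composed functions. */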
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0; /* We should not need cusparseStruct->stream */
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    stat = cusparseSetStream(cusparseStruct->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE)
.  n - number of local columns (or PETSC_DECIDE)
.  M - number of global rows (or PETSC_DETERMINE)
.  N - number of global columns (or PETSC_DETERMINE)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either the nz parameters or the nnz arrays (not both).
   Set the nz parameters to PETSC_DEFAULT and the nnz arrays to NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
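
/* Example (a hedged sketch): create a distributed AIJCUSPARSE matrix with at
   most 5 nonzeros per row in the diagonal block and 2 in the off-diagonal
   block; the global sizes M and N are illustrative.

     ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,M,N,5,NULL,2,NULL,&A);CHKERRQ(ierr);
*/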

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

// Get a device pointer to the stripped-down CSR representation of the matrix, for both Seq and MPI Mat.
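// The returned PetscSplitCSRDataStructure carries raw device pointers to the
// row offsets, column indices, and values of the diagonal (and, in parallel,
// off-diagonal) CSR blocks plus the colmap, so that device kernels can write
// values directly. As a side effect the matrix is marked unassembled, since
// the caller is expected to set values on the device.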
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;
  PetscScalar                *aa,*ba;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because this is not called on the host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assembled more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazily) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need to assemble matrix first");
  }
  if (!*p_d_mat) {
    cudaError_t                 err;
    PetscSplitCSRDataStructure  *d_mat, h_mat;
    Mat_SeqAIJ                  *jaca;
    PetscInt                    n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
    } else {
      Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ  *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazily) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9;
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      if (jacb->compressedrow.use) {
        err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi;
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with MatSetValues - this is done (lazily) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}
826