xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 6a29ce69e29aaf78b7325cac1ae663c7f0b163da)
1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
253800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
399acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
40fd2b57fSKarl Rupp 
53d13b8fdSMatthew G. Knepley #include <petscconf.h>
69ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
757d48284SJunchao Zhang #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
83d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
97e8381f9SStefano Zampini #include <thrust/advance.h>
104eb5378fSStefano Zampini #include <petscsf.h>
117e8381f9SStefano Zampini 
127e8381f9SStefano Zampini struct VecCUDAEquals
137e8381f9SStefano Zampini {
147e8381f9SStefano Zampini   template <typename Tuple>
157e8381f9SStefano Zampini   __host__ __device__
167e8381f9SStefano Zampini   void operator()(Tuple t)
177e8381f9SStefano Zampini   {
187e8381f9SStefano Zampini     thrust::get<1>(t) = thrust::get<0>(t);
197e8381f9SStefano Zampini   }
207e8381f9SStefano Zampini };
217e8381f9SStefano Zampini 
227e8381f9SStefano Zampini static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
237e8381f9SStefano Zampini {
247e8381f9SStefano Zampini   Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
257e8381f9SStefano Zampini   Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
267e8381f9SStefano Zampini   PetscInt           n = cusp->coo_nd + cusp->coo_no;
277e8381f9SStefano Zampini   PetscErrorCode     ierr;
287e8381f9SStefano Zampini   cudaError_t        cerr;
297e8381f9SStefano Zampini 
307e8381f9SStefano Zampini   PetscFunctionBegin;
317e8381f9SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
32e61fc153SStefano Zampini   if (cusp->coo_p && v) {
3308391a17SStefano Zampini     thrust::device_ptr<const PetscScalar> d_v;
3408391a17SStefano Zampini     THRUSTARRAY                           *w = NULL;
3508391a17SStefano Zampini 
3608391a17SStefano Zampini     if (isCudaMem(v)) {
3708391a17SStefano Zampini       d_v = thrust::device_pointer_cast(v);
3808391a17SStefano Zampini     } else {
39e61fc153SStefano Zampini       w = new THRUSTARRAY(n);
40e61fc153SStefano Zampini       w->assign(v,v+n);
4108391a17SStefano Zampini       ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
4208391a17SStefano Zampini       d_v = w->data();
4308391a17SStefano Zampini     }
4408391a17SStefano Zampini 
4508391a17SStefano Zampini     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
467e8381f9SStefano Zampini                                                               cusp->coo_pw->begin()));
4708391a17SStefano Zampini     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
487e8381f9SStefano Zampini                                                               cusp->coo_pw->end()));
4908391a17SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
507e8381f9SStefano Zampini     thrust::for_each(zibit,zieit,VecCUDAEquals());
5108391a17SStefano Zampini     cerr = WaitForCUDA();CHKERRCUDA(cerr);
5208391a17SStefano Zampini     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
53e61fc153SStefano Zampini     delete w;
547e8381f9SStefano Zampini     ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
557e8381f9SStefano Zampini     ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
567e8381f9SStefano Zampini   } else {
577e8381f9SStefano Zampini     ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
587e8381f9SStefano Zampini     ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
597e8381f9SStefano Zampini   }
607e8381f9SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
617e8381f9SStefano Zampini   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
627e8381f9SStefano Zampini   A->num_ass++;
637e8381f9SStefano Zampini   A->assembled        = PETSC_TRUE;
647e8381f9SStefano Zampini   A->ass_nonzerostate = A->nonzerostate;
654eb5378fSStefano Zampini   A->offloadmask      = PETSC_OFFLOAD_GPU;
667e8381f9SStefano Zampini   PetscFunctionReturn(0);
677e8381f9SStefano Zampini }
687e8381f9SStefano Zampini 
697e8381f9SStefano Zampini template <typename Tuple>
707e8381f9SStefano Zampini struct IsNotOffDiagT
717e8381f9SStefano Zampini {
727e8381f9SStefano Zampini   PetscInt _cstart,_cend;
737e8381f9SStefano Zampini 
747e8381f9SStefano Zampini   IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
757e8381f9SStefano Zampini   __host__ __device__
767e8381f9SStefano Zampini   inline bool operator()(Tuple t)
777e8381f9SStefano Zampini   {
787e8381f9SStefano Zampini     return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
797e8381f9SStefano Zampini   }
807e8381f9SStefano Zampini };
817e8381f9SStefano Zampini 
827e8381f9SStefano Zampini struct IsOffDiag
837e8381f9SStefano Zampini {
847e8381f9SStefano Zampini   PetscInt _cstart,_cend;
857e8381f9SStefano Zampini 
867e8381f9SStefano Zampini   IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
877e8381f9SStefano Zampini   __host__ __device__
887e8381f9SStefano Zampini   inline bool operator() (const PetscInt &c)
897e8381f9SStefano Zampini   {
907e8381f9SStefano Zampini     return c < _cstart || c >= _cend;
917e8381f9SStefano Zampini   }
927e8381f9SStefano Zampini };
937e8381f9SStefano Zampini 
947e8381f9SStefano Zampini struct GlobToLoc
957e8381f9SStefano Zampini {
967e8381f9SStefano Zampini   PetscInt _start;
977e8381f9SStefano Zampini 
987e8381f9SStefano Zampini   GlobToLoc(PetscInt start) : _start(start) {}
997e8381f9SStefano Zampini   __host__ __device__
1007e8381f9SStefano Zampini   inline PetscInt operator() (const PetscInt &c)
1017e8381f9SStefano Zampini   {
1027e8381f9SStefano Zampini     return c - _start;
1037e8381f9SStefano Zampini   }
1047e8381f9SStefano Zampini };
1057e8381f9SStefano Zampini 
1067e8381f9SStefano Zampini static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
1077e8381f9SStefano Zampini {
1087e8381f9SStefano Zampini   Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
1097e8381f9SStefano Zampini   Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
1107e8381f9SStefano Zampini   PetscErrorCode         ierr;
1117e8381f9SStefano Zampini   PetscInt               *jj;
1127e8381f9SStefano Zampini   size_t                 noff = 0;
1137e8381f9SStefano Zampini   THRUSTINTARRAY         d_i(n);
1147e8381f9SStefano Zampini   THRUSTINTARRAY         d_j(n);
1157e8381f9SStefano Zampini   ISLocalToGlobalMapping l2g;
1167e8381f9SStefano Zampini   cudaError_t            cerr;
1177e8381f9SStefano Zampini 
1187e8381f9SStefano Zampini   PetscFunctionBegin;
1197e8381f9SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);
1207e8381f9SStefano Zampini   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
1217e8381f9SStefano Zampini   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
1227e8381f9SStefano Zampini   if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
1237e8381f9SStefano Zampini   if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
1247e8381f9SStefano Zampini   ierr = PetscFree(b->garray);CHKERRQ(ierr);
1257e8381f9SStefano Zampini   ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
1267e8381f9SStefano Zampini   ierr = MatDestroy(&b->A);CHKERRQ(ierr);
1277e8381f9SStefano Zampini   ierr = MatDestroy(&b->B);CHKERRQ(ierr);
1287e8381f9SStefano Zampini 
1297e8381f9SStefano Zampini   ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
1307e8381f9SStefano Zampini   d_i.assign(coo_i,coo_i+n);
1317e8381f9SStefano Zampini   d_j.assign(coo_j,coo_j+n);
1327e8381f9SStefano Zampini   delete cusp->coo_p;
1337e8381f9SStefano Zampini   delete cusp->coo_pw;
1347e8381f9SStefano Zampini   cusp->coo_p = NULL;
1357e8381f9SStefano Zampini   cusp->coo_pw = NULL;
13608391a17SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1377e8381f9SStefano Zampini   auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
1387e8381f9SStefano Zampini   auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
1397e8381f9SStefano Zampini   if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
1407e8381f9SStefano Zampini     cusp->coo_p = new THRUSTINTARRAY(n);
1417e8381f9SStefano Zampini     cusp->coo_pw = new THRUSTARRAY(n);
1427e8381f9SStefano Zampini     thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
1437e8381f9SStefano Zampini     auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
1447e8381f9SStefano Zampini     auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
1457e8381f9SStefano Zampini     auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
1467e8381f9SStefano Zampini     firstoffd = mzipp.get_iterator_tuple().get<1>();
1477e8381f9SStefano Zampini   }
1487e8381f9SStefano Zampini   cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
1497e8381f9SStefano Zampini   cusp->coo_no = thrust::distance(firstoffd,d_j.end());
1507e8381f9SStefano Zampini 
1517e8381f9SStefano Zampini   /* from global to local */
1527e8381f9SStefano Zampini   thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
1537e8381f9SStefano Zampini   thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
15408391a17SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
15508391a17SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1567e8381f9SStefano Zampini 
1577e8381f9SStefano Zampini   /* copy offdiag column indices to map on the CPU */
1587e8381f9SStefano Zampini   ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
1597e8381f9SStefano Zampini   cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1607e8381f9SStefano Zampini   auto o_j = d_j.begin();
16108391a17SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1627e8381f9SStefano Zampini   thrust::advance(o_j,cusp->coo_nd);
1637e8381f9SStefano Zampini   thrust::sort(thrust::device,o_j,d_j.end());
1647e8381f9SStefano Zampini   auto wit = thrust::unique(thrust::device,o_j,d_j.end());
16508391a17SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
16608391a17SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1677e8381f9SStefano Zampini   noff = thrust::distance(o_j,wit);
1687e8381f9SStefano Zampini   ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
1697e8381f9SStefano Zampini   cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1707e8381f9SStefano Zampini   ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
1717e8381f9SStefano Zampini   ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
1727e8381f9SStefano Zampini   ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
1737e8381f9SStefano Zampini   ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
1747e8381f9SStefano Zampini   if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
1757e8381f9SStefano Zampini   ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);
1767e8381f9SStefano Zampini 
1777e8381f9SStefano Zampini   ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
1787e8381f9SStefano Zampini   ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
1797e8381f9SStefano Zampini   ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1807e8381f9SStefano Zampini   ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
1817e8381f9SStefano Zampini   ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
1827e8381f9SStefano Zampini   ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
1837e8381f9SStefano Zampini   ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1847e8381f9SStefano Zampini   ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
1857e8381f9SStefano Zampini 
1867e8381f9SStefano Zampini   /* GPU memory, cusparse specific call handles it internally */
1877e8381f9SStefano Zampini   ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
1887e8381f9SStefano Zampini   ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
1897e8381f9SStefano Zampini   ierr = PetscFree(jj);CHKERRQ(ierr);
1907e8381f9SStefano Zampini 
1917e8381f9SStefano Zampini   ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
1927e8381f9SStefano Zampini   ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
1937e8381f9SStefano Zampini   ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
1947e8381f9SStefano Zampini   ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
1957e8381f9SStefano Zampini   ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
1967e8381f9SStefano Zampini   ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
1977e8381f9SStefano Zampini   ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
1987e8381f9SStefano Zampini   B->preallocated = PETSC_TRUE;
1997e8381f9SStefano Zampini   B->nonzerostate++;
2007e8381f9SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);
2017e8381f9SStefano Zampini 
2027e8381f9SStefano Zampini   ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
2037e8381f9SStefano Zampini   ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
2047e8381f9SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
2057e8381f9SStefano Zampini   B->assembled = PETSC_FALSE;
2067e8381f9SStefano Zampini   B->was_assembled = PETSC_FALSE;
2077e8381f9SStefano Zampini   PetscFunctionReturn(0);
2087e8381f9SStefano Zampini }
2099ae82921SPaul Mullowney 
210ed502f03SStefano Zampini static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
211ed502f03SStefano Zampini {
212ed502f03SStefano Zampini   Mat            Ad,Ao;
213ed502f03SStefano Zampini   const PetscInt *cmap;
214ed502f03SStefano Zampini   PetscErrorCode ierr;
215ed502f03SStefano Zampini 
216ed502f03SStefano Zampini   PetscFunctionBegin;
217ed502f03SStefano Zampini   ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
218ed502f03SStefano Zampini   ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
219ed502f03SStefano Zampini   if (glob) {
220ed502f03SStefano Zampini     PetscInt cst, i, dn, on, *gidx;
221ed502f03SStefano Zampini 
222ed502f03SStefano Zampini     ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
223ed502f03SStefano Zampini     ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
224ed502f03SStefano Zampini     ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
225ed502f03SStefano Zampini     ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
226ed502f03SStefano Zampini     for (i=0; i<dn; i++) gidx[i]    = cst + i;
227ed502f03SStefano Zampini     for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
228ed502f03SStefano Zampini     ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
229ed502f03SStefano Zampini   }
230ed502f03SStefano Zampini   PetscFunctionReturn(0);
231ed502f03SStefano Zampini }
232ed502f03SStefano Zampini 
2339ae82921SPaul Mullowney PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
2349ae82921SPaul Mullowney {
235bbf3fe20SPaul Mullowney   Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
236bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
2379ae82921SPaul Mullowney   PetscErrorCode     ierr;
2389ae82921SPaul Mullowney   PetscInt           i;
2399ae82921SPaul Mullowney 
2409ae82921SPaul Mullowney   PetscFunctionBegin;
2419ae82921SPaul Mullowney   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
2429ae82921SPaul Mullowney   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
2437e8381f9SStefano Zampini   if (PetscDefined(USE_DEBUG) && d_nnz) {
2449ae82921SPaul Mullowney     for (i=0; i<B->rmap->n; i++) {
2459ae82921SPaul Mullowney       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
2469ae82921SPaul Mullowney     }
2479ae82921SPaul Mullowney   }
2487e8381f9SStefano Zampini   if (PetscDefined(USE_DEBUG) && o_nnz) {
2499ae82921SPaul Mullowney     for (i=0; i<B->rmap->n; i++) {
2509ae82921SPaul Mullowney       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
2519ae82921SPaul Mullowney     }
2529ae82921SPaul Mullowney   }
253*6a29ce69SStefano Zampini #if defined(PETSC_USE_CTABLE)
254*6a29ce69SStefano Zampini   ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
255*6a29ce69SStefano Zampini #else
256*6a29ce69SStefano Zampini   ierr = PetscFree(b->colmap);CHKERRQ(ierr);
257*6a29ce69SStefano Zampini #endif
258*6a29ce69SStefano Zampini   ierr = PetscFree(b->garray);CHKERRQ(ierr);
259*6a29ce69SStefano Zampini   ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
260*6a29ce69SStefano Zampini   ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
261*6a29ce69SStefano Zampini   /* Because the B will have been resized we simply destroy it and create a new one each time */
262*6a29ce69SStefano Zampini   ierr = MatDestroy(&b->B);CHKERRQ(ierr);
263*6a29ce69SStefano Zampini   if (!b->A) {
2649ae82921SPaul Mullowney     ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
2659ae82921SPaul Mullowney     ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
2663bb1ff40SBarry Smith     ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
267*6a29ce69SStefano Zampini   }
268*6a29ce69SStefano Zampini   if (!b->B) {
269*6a29ce69SStefano Zampini     PetscMPIInt size;
270*6a29ce69SStefano Zampini     ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRQ(ierr);
2719ae82921SPaul Mullowney     ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
272*6a29ce69SStefano Zampini     ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
2733bb1ff40SBarry Smith     ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
2749ae82921SPaul Mullowney   }
275d98d7c49SStefano Zampini   ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
276d98d7c49SStefano Zampini   ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
277*6a29ce69SStefano Zampini   ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
278*6a29ce69SStefano Zampini   ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
2799ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
2809ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
281e057df02SPaul Mullowney   ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
282e057df02SPaul Mullowney   ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
283b06137fdSPaul Mullowney   ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
284b06137fdSPaul Mullowney   ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
285b06137fdSPaul Mullowney   ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
286b06137fdSPaul Mullowney   ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
2872205254eSKarl Rupp 
2889ae82921SPaul Mullowney   B->preallocated = PETSC_TRUE;
2899ae82921SPaul Mullowney   PetscFunctionReturn(0);
2909ae82921SPaul Mullowney }
291e057df02SPaul Mullowney 
2924eb5378fSStefano Zampini typedef struct {
2934eb5378fSStefano Zampini   Mat         *mp;    /* intermediate products */
2944eb5378fSStefano Zampini   PetscBool   *mptmp; /* is the intermediate product temporary ? */
2954eb5378fSStefano Zampini   PetscInt    cp;     /* number of intermediate products */
2964eb5378fSStefano Zampini 
2974eb5378fSStefano Zampini   /* support for MatGetBrowsOfAoCols_MPIAIJ for P_oth */
2984eb5378fSStefano Zampini   PetscInt    *startsj_s,*startsj_r;
2994eb5378fSStefano Zampini   PetscScalar *bufa;
3004eb5378fSStefano Zampini   Mat         P_oth;
3014eb5378fSStefano Zampini 
3024eb5378fSStefano Zampini   /* may take advantage of merging product->B */
3034eb5378fSStefano Zampini   Mat Bloc;
3044eb5378fSStefano Zampini 
3054eb5378fSStefano Zampini   /* cusparse does not have support to split between symbolic and numeric phases
3064eb5378fSStefano Zampini      When api_user is true, we don't need to update the numerical values
3074eb5378fSStefano Zampini      of the temporary storage */
3084eb5378fSStefano Zampini   PetscBool reusesym;
3094eb5378fSStefano Zampini 
3104eb5378fSStefano Zampini   /* support for COO values insertion */
3114eb5378fSStefano Zampini   PetscScalar *coo_v,*coo_w;
3124eb5378fSStefano Zampini   PetscSF     sf; /* if present, non-local values insertion (i.e. AtB or PtAP) */
3134eb5378fSStefano Zampini } MatMatMPIAIJCUSPARSE;
3144eb5378fSStefano Zampini 
3154eb5378fSStefano Zampini PetscErrorCode MatDestroy_MatMatMPIAIJCUSPARSE(void *data)
3164eb5378fSStefano Zampini {
3174eb5378fSStefano Zampini   MatMatMPIAIJCUSPARSE *mmdata = (MatMatMPIAIJCUSPARSE*)data;
3184eb5378fSStefano Zampini   PetscInt             i;
3194eb5378fSStefano Zampini   PetscErrorCode       ierr;
3204eb5378fSStefano Zampini 
3214eb5378fSStefano Zampini   PetscFunctionBegin;
3224eb5378fSStefano Zampini   ierr = PetscFree2(mmdata->startsj_s,mmdata->startsj_r);CHKERRQ(ierr);
3234eb5378fSStefano Zampini   ierr = PetscFree(mmdata->bufa);CHKERRQ(ierr);
3244eb5378fSStefano Zampini   ierr = PetscFree(mmdata->coo_v);CHKERRQ(ierr);
3254eb5378fSStefano Zampini   ierr = PetscFree(mmdata->coo_w);CHKERRQ(ierr);
3264eb5378fSStefano Zampini   ierr = MatDestroy(&mmdata->P_oth);CHKERRQ(ierr);
3274eb5378fSStefano Zampini   ierr = MatDestroy(&mmdata->Bloc);CHKERRQ(ierr);
3284eb5378fSStefano Zampini   ierr = PetscSFDestroy(&mmdata->sf);CHKERRQ(ierr);
3294eb5378fSStefano Zampini   for (i = 0; i < mmdata->cp; i++) {
3304eb5378fSStefano Zampini     ierr = MatDestroy(&mmdata->mp[i]);CHKERRQ(ierr);
3314eb5378fSStefano Zampini   }
3324eb5378fSStefano Zampini   ierr = PetscFree(mmdata->mp);CHKERRQ(ierr);
3334eb5378fSStefano Zampini   ierr = PetscFree(mmdata->mptmp);CHKERRQ(ierr);
3344eb5378fSStefano Zampini   ierr = PetscFree(mmdata);CHKERRQ(ierr);
3354eb5378fSStefano Zampini   PetscFunctionReturn(0);
3364eb5378fSStefano Zampini }
3374eb5378fSStefano Zampini 
3384eb5378fSStefano Zampini static PetscErrorCode MatProductNumeric_MPIAIJCUSPARSE_MPIAIJCUSPARSE(Mat C)
3394eb5378fSStefano Zampini {
3404eb5378fSStefano Zampini   MatMatMPIAIJCUSPARSE *mmdata;
3414eb5378fSStefano Zampini   PetscScalar          *tmp;
3424eb5378fSStefano Zampini   PetscInt             i,n;
3434eb5378fSStefano Zampini   PetscErrorCode       ierr;
3444eb5378fSStefano Zampini 
3454eb5378fSStefano Zampini   PetscFunctionBegin;
3464eb5378fSStefano Zampini   MatCheckProduct(C,1);
3474eb5378fSStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
3484eb5378fSStefano Zampini   mmdata = (MatMatMPIAIJCUSPARSE*)C->product->data;
3494eb5378fSStefano Zampini   tmp = mmdata->sf ? mmdata->coo_w : mmdata->coo_v;
3504eb5378fSStefano Zampini   if (!mmdata->reusesym) { /* update temporary matrices */
3514eb5378fSStefano Zampini     if (mmdata->P_oth) {
3524eb5378fSStefano Zampini       ierr = MatGetBrowsOfAoCols_MPIAIJ(C->product->A,C->product->B,MAT_REUSE_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr);
3534eb5378fSStefano Zampini     }
3544eb5378fSStefano Zampini     if (mmdata->Bloc) {
3554eb5378fSStefano Zampini       ierr = MatMPIAIJGetLocalMatMerge(C->product->B,MAT_REUSE_MATRIX,NULL,&mmdata->Bloc);CHKERRQ(ierr);
3564eb5378fSStefano Zampini     }
3574eb5378fSStefano Zampini   }
3584eb5378fSStefano Zampini   mmdata->reusesym = PETSC_FALSE;
3594eb5378fSStefano Zampini   for (i = 0, n = 0; i < mmdata->cp; i++) {
3604eb5378fSStefano Zampini     Mat_SeqAIJ        *mm = (Mat_SeqAIJ*)mmdata->mp[i]->data;
3614eb5378fSStefano Zampini     const PetscScalar *vv;
3624eb5378fSStefano Zampini 
3634eb5378fSStefano Zampini     if (!mmdata->mp[i]->ops->productnumeric) SETERRQ1(PetscObjectComm((PetscObject)mmdata->mp[i]),PETSC_ERR_PLIB,"Missing numeric op for %s",MatProductTypes[mmdata->mp[i]->product->type]);
3644eb5378fSStefano Zampini     ierr = (*mmdata->mp[i]->ops->productnumeric)(mmdata->mp[i]);CHKERRQ(ierr);
3654eb5378fSStefano Zampini     if (mmdata->mptmp[i]) continue;
3664eb5378fSStefano Zampini     /* TODO: add support for using GPU data directly */
3674eb5378fSStefano Zampini     ierr = MatSeqAIJGetArrayRead(mmdata->mp[i],&vv);CHKERRQ(ierr);
3684eb5378fSStefano Zampini     ierr = PetscArraycpy(tmp + n,vv,mm->nz);CHKERRQ(ierr);
3694eb5378fSStefano Zampini     ierr = MatSeqAIJRestoreArrayRead(mmdata->mp[i],&vv);CHKERRQ(ierr);
3704eb5378fSStefano Zampini     n   += mm->nz;
3714eb5378fSStefano Zampini   }
3724eb5378fSStefano Zampini   if (mmdata->sf) { /* offprocess insertion */
3734eb5378fSStefano Zampini     ierr = PetscSFGatherBegin(mmdata->sf,MPIU_SCALAR,tmp,mmdata->coo_v);CHKERRQ(ierr);
3744eb5378fSStefano Zampini     ierr = PetscSFGatherEnd(mmdata->sf,MPIU_SCALAR,tmp,mmdata->coo_v);CHKERRQ(ierr);
3754eb5378fSStefano Zampini   }
3764eb5378fSStefano Zampini   ierr = MatSetValuesCOO(C,mmdata->coo_v,INSERT_VALUES);CHKERRQ(ierr);
3774eb5378fSStefano Zampini   PetscFunctionReturn(0);
3784eb5378fSStefano Zampini }
3794eb5378fSStefano Zampini 
3804eb5378fSStefano Zampini /* Pt * A or A * P */
3814eb5378fSStefano Zampini #define MAX_NUMBER_INTERMEDIATE 4
3824eb5378fSStefano Zampini static PetscErrorCode MatProductSymbolic_MPIAIJCUSPARSE_MPIAIJCUSPARSE(Mat C)
3834eb5378fSStefano Zampini {
3844eb5378fSStefano Zampini   Mat_Product            *product = C->product;
3854eb5378fSStefano Zampini   Mat                    A,P,mp[MAX_NUMBER_INTERMEDIATE];
3864eb5378fSStefano Zampini   Mat_MPIAIJ             *a,*p;
3874eb5378fSStefano Zampini   MatMatMPIAIJCUSPARSE   *mmdata;
3884eb5378fSStefano Zampini   ISLocalToGlobalMapping P_oth_l2g = NULL;
3894eb5378fSStefano Zampini   IS                     glob = NULL;
3904eb5378fSStefano Zampini   const PetscInt         *globidx,*P_oth_idx;
3914eb5378fSStefano Zampini   const PetscInt         *cmapa[MAX_NUMBER_INTERMEDIATE],*rmapa[MAX_NUMBER_INTERMEDIATE];
3924eb5378fSStefano Zampini   PetscInt               cp = 0,m,n,M,N,ncoo,*coo_i,*coo_j,cmapt[MAX_NUMBER_INTERMEDIATE],rmapt[MAX_NUMBER_INTERMEDIATE],i,j;
3934eb5378fSStefano Zampini   MatProductType         ptype;
3944eb5378fSStefano Zampini   PetscBool              mptmp[MAX_NUMBER_INTERMEDIATE],hasoffproc = PETSC_FALSE;
3954eb5378fSStefano Zampini   PetscMPIInt            size;
3964eb5378fSStefano Zampini   PetscErrorCode         ierr;
3974eb5378fSStefano Zampini 
3984eb5378fSStefano Zampini   PetscFunctionBegin;
3994eb5378fSStefano Zampini   MatCheckProduct(C,1);
4004eb5378fSStefano Zampini   if (product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
4014eb5378fSStefano Zampini   ptype = product->type;
4024eb5378fSStefano Zampini   if (product->A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
4034eb5378fSStefano Zampini   switch (ptype) {
4044eb5378fSStefano Zampini   case MATPRODUCT_AB:
4054eb5378fSStefano Zampini     A = product->A;
4064eb5378fSStefano Zampini     P = product->B;
4074eb5378fSStefano Zampini     m = A->rmap->n;
4084eb5378fSStefano Zampini     n = P->cmap->n;
4094eb5378fSStefano Zampini     M = A->rmap->N;
4104eb5378fSStefano Zampini     N = P->cmap->N;
4114eb5378fSStefano Zampini     break;
4124eb5378fSStefano Zampini   case MATPRODUCT_AtB:
4134eb5378fSStefano Zampini     P = product->A;
4144eb5378fSStefano Zampini     A = product->B;
4154eb5378fSStefano Zampini     m = P->cmap->n;
4164eb5378fSStefano Zampini     n = A->cmap->n;
4174eb5378fSStefano Zampini     M = P->cmap->N;
4184eb5378fSStefano Zampini     N = A->cmap->N;
4194eb5378fSStefano Zampini     hasoffproc = PETSC_TRUE;
4204eb5378fSStefano Zampini     break;
4214eb5378fSStefano Zampini   case MATPRODUCT_PtAP:
4224eb5378fSStefano Zampini     A = product->A;
4234eb5378fSStefano Zampini     P = product->B;
4244eb5378fSStefano Zampini     m = P->cmap->n;
4254eb5378fSStefano Zampini     n = P->cmap->n;
4264eb5378fSStefano Zampini     M = P->cmap->N;
4274eb5378fSStefano Zampini     N = P->cmap->N;
4284eb5378fSStefano Zampini     hasoffproc = PETSC_TRUE;
4294eb5378fSStefano Zampini     break;
4304eb5378fSStefano Zampini   default:
4314eb5378fSStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]);
4324eb5378fSStefano Zampini   }
4334eb5378fSStefano Zampini   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)C),&size);CHKERRQ(ierr);
4344eb5378fSStefano Zampini   if (size == 1) hasoffproc = PETSC_FALSE;
4354eb5378fSStefano Zampini 
4364eb5378fSStefano Zampini   for (i=0;i<MAX_NUMBER_INTERMEDIATE;i++) {
4374eb5378fSStefano Zampini     mp[i]    = NULL;
4384eb5378fSStefano Zampini     mptmp[i] = PETSC_FALSE;
4394eb5378fSStefano Zampini     rmapt[i] = 0;
4404eb5378fSStefano Zampini     cmapt[i] = 0;
4414eb5378fSStefano Zampini     rmapa[i] = NULL;
4424eb5378fSStefano Zampini     cmapa[i] = NULL;
4434eb5378fSStefano Zampini   }
4444eb5378fSStefano Zampini 
4454eb5378fSStefano Zampini   a = (Mat_MPIAIJ*)A->data;
4464eb5378fSStefano Zampini   p = (Mat_MPIAIJ*)P->data;
4474eb5378fSStefano Zampini   ierr = MatSetSizes(C,m,n,M,N);CHKERRQ(ierr);
4484eb5378fSStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
4494eb5378fSStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
4504eb5378fSStefano Zampini   ierr = MatSetType(C,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
4514eb5378fSStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
4524eb5378fSStefano Zampini   mmdata->reusesym = product->api_user;
4534eb5378fSStefano Zampini   switch (ptype) {
4544eb5378fSStefano Zampini   case MATPRODUCT_AB: /* A * P */
4554eb5378fSStefano Zampini     ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr);
4564eb5378fSStefano Zampini 
4574eb5378fSStefano Zampini     if (1) { /* A_diag * P_loc and A_off * P_oth TODO: add customization for this */
4584eb5378fSStefano Zampini       /* P is product->B */
4594eb5378fSStefano Zampini       ierr = MatMPIAIJGetLocalMatMerge(P,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr);
4604eb5378fSStefano Zampini       ierr = MatProductCreate(a->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
4614eb5378fSStefano Zampini       ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
4624eb5378fSStefano Zampini       ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
4634eb5378fSStefano Zampini       mp[cp]->product->api_user = product->api_user;
4644eb5378fSStefano Zampini       ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
4654eb5378fSStefano Zampini       if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
4664eb5378fSStefano Zampini       ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
4674eb5378fSStefano Zampini       ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr);
4684eb5378fSStefano Zampini       rmapt[cp] = 1;
4694eb5378fSStefano Zampini       cmapt[cp] = 2;
4704eb5378fSStefano Zampini       cmapa[cp] = globidx;
4714eb5378fSStefano Zampini       mptmp[cp] = PETSC_FALSE;
4724eb5378fSStefano Zampini       cp++;
4734eb5378fSStefano Zampini     } else { /* A_diag * P_diag and A_diag * P_off and A_off * P_oth */
4744eb5378fSStefano Zampini       ierr = MatProductCreate(a->A,p->A,NULL,&mp[cp]);CHKERRQ(ierr);
4754eb5378fSStefano Zampini       ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
4764eb5378fSStefano Zampini       ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
4774eb5378fSStefano Zampini       mp[cp]->product->api_user = product->api_user;
4784eb5378fSStefano Zampini       ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
4794eb5378fSStefano Zampini       if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
4804eb5378fSStefano Zampini       ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
4814eb5378fSStefano Zampini       rmapt[cp] = 1;
4824eb5378fSStefano Zampini       cmapt[cp] = 1;
4834eb5378fSStefano Zampini       mptmp[cp] = PETSC_FALSE;
4844eb5378fSStefano Zampini       cp++;
4854eb5378fSStefano Zampini       ierr = MatProductCreate(a->A,p->B,NULL,&mp[cp]);CHKERRQ(ierr);
4864eb5378fSStefano Zampini       ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
4874eb5378fSStefano Zampini       ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
4884eb5378fSStefano Zampini       mp[cp]->product->api_user = product->api_user;
4894eb5378fSStefano Zampini       ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
4904eb5378fSStefano Zampini       if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
4914eb5378fSStefano Zampini       ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
4924eb5378fSStefano Zampini       rmapt[cp] = 1;
4934eb5378fSStefano Zampini       cmapt[cp] = 2;
4944eb5378fSStefano Zampini       cmapa[cp] = p->garray;
4954eb5378fSStefano Zampini       mptmp[cp] = PETSC_FALSE;
4964eb5378fSStefano Zampini       cp++;
4974eb5378fSStefano Zampini     }
4984eb5378fSStefano Zampini     if (mmdata->P_oth) {
4994eb5378fSStefano Zampini       ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth,&P_oth_l2g);CHKERRQ(ierr);
5004eb5378fSStefano Zampini       ierr = ISLocalToGlobalMappingGetIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr);
5014eb5378fSStefano Zampini       ierr = MatSetType(mmdata->P_oth,((PetscObject)(a->B))->type_name);CHKERRQ(ierr);
5024eb5378fSStefano Zampini       ierr = MatProductCreate(a->B,mmdata->P_oth,NULL,&mp[cp]);CHKERRQ(ierr);
5034eb5378fSStefano Zampini       ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
5044eb5378fSStefano Zampini       ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
5054eb5378fSStefano Zampini       mp[cp]->product->api_user = product->api_user;
5064eb5378fSStefano Zampini       ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
5074eb5378fSStefano Zampini       if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
5084eb5378fSStefano Zampini       ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
5094eb5378fSStefano Zampini       rmapt[cp] = 1;
5104eb5378fSStefano Zampini       cmapt[cp] = 2;
5114eb5378fSStefano Zampini       cmapa[cp] = P_oth_idx;
5124eb5378fSStefano Zampini       mptmp[cp] = PETSC_FALSE;
5134eb5378fSStefano Zampini       cp++;
5144eb5378fSStefano Zampini     }
5154eb5378fSStefano Zampini     break;
5164eb5378fSStefano Zampini   case MATPRODUCT_AtB: /* (P^t * A): P_diag * A_loc + P_off * A_loc */
5174eb5378fSStefano Zampini     /* A is product->B */
5184eb5378fSStefano Zampini     ierr = MatMPIAIJGetLocalMatMerge(A,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr);
5194eb5378fSStefano Zampini     ierr = MatProductCreate(p->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
5204eb5378fSStefano Zampini     ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr);
5214eb5378fSStefano Zampini     ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
5224eb5378fSStefano Zampini     mp[cp]->product->api_user = product->api_user;
5234eb5378fSStefano Zampini     ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
5244eb5378fSStefano Zampini     if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
5254eb5378fSStefano Zampini     ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
5264eb5378fSStefano Zampini     ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr);
5274eb5378fSStefano Zampini     rmapt[cp] = 1;
5284eb5378fSStefano Zampini     cmapt[cp] = 2;
5294eb5378fSStefano Zampini     cmapa[cp] = globidx;
5304eb5378fSStefano Zampini     mptmp[cp] = PETSC_FALSE;
5314eb5378fSStefano Zampini     cp++;
5324eb5378fSStefano Zampini     ierr = MatProductCreate(p->B,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
5334eb5378fSStefano Zampini     ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr);
5344eb5378fSStefano Zampini     ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
5354eb5378fSStefano Zampini     mp[cp]->product->api_user = product->api_user;
5364eb5378fSStefano Zampini     ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
5374eb5378fSStefano Zampini     if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
5384eb5378fSStefano Zampini     ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
5394eb5378fSStefano Zampini     rmapt[cp] = 2;
5404eb5378fSStefano Zampini     rmapa[cp] = p->garray;
5414eb5378fSStefano Zampini     cmapt[cp] = 2;
5424eb5378fSStefano Zampini     cmapa[cp] = globidx;
5434eb5378fSStefano Zampini     mptmp[cp] = PETSC_FALSE;
5444eb5378fSStefano Zampini     cp++;
5454eb5378fSStefano Zampini     break;
5464eb5378fSStefano Zampini   case MATPRODUCT_PtAP:
5474eb5378fSStefano Zampini     ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr);
5484eb5378fSStefano Zampini     /* P is product->B */
5494eb5378fSStefano Zampini     ierr = MatMPIAIJGetLocalMatMerge(P,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr);
5504eb5378fSStefano Zampini     ierr = MatProductCreate(a->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
5514eb5378fSStefano Zampini     ierr = MatProductSetType(mp[cp],MATPRODUCT_PtAP);CHKERRQ(ierr);
5524eb5378fSStefano Zampini     ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
5534eb5378fSStefano Zampini     mp[cp]->product->api_user = product->api_user;
5544eb5378fSStefano Zampini     ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
5554eb5378fSStefano Zampini     if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
5564eb5378fSStefano Zampini     ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
5574eb5378fSStefano Zampini     ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr);
5584eb5378fSStefano Zampini     rmapt[cp] = 2;
5594eb5378fSStefano Zampini     rmapa[cp] = globidx;
5604eb5378fSStefano Zampini     cmapt[cp] = 2;
5614eb5378fSStefano Zampini     cmapa[cp] = globidx;
5624eb5378fSStefano Zampini     mptmp[cp] = PETSC_FALSE;
5634eb5378fSStefano Zampini     cp++;
5644eb5378fSStefano Zampini     if (mmdata->P_oth) {
5654eb5378fSStefano Zampini       ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth,&P_oth_l2g);CHKERRQ(ierr);
5664eb5378fSStefano Zampini       ierr = MatSetType(mmdata->P_oth,((PetscObject)(a->B))->type_name);CHKERRQ(ierr);
5674eb5378fSStefano Zampini       ierr = ISLocalToGlobalMappingGetIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr);
5684eb5378fSStefano Zampini       ierr = MatProductCreate(a->B,mmdata->P_oth,NULL,&mp[cp]);CHKERRQ(ierr);
5694eb5378fSStefano Zampini       ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
5704eb5378fSStefano Zampini       ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
5714eb5378fSStefano Zampini       mp[cp]->product->api_user = product->api_user;
5724eb5378fSStefano Zampini       ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
5734eb5378fSStefano Zampini       if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
5744eb5378fSStefano Zampini       ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
5754eb5378fSStefano Zampini       mptmp[cp] = PETSC_TRUE;
5764eb5378fSStefano Zampini       cp++;
5774eb5378fSStefano Zampini       ierr = MatProductCreate(mmdata->Bloc,mp[1],NULL,&mp[cp]);CHKERRQ(ierr);
5784eb5378fSStefano Zampini       ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr);
5794eb5378fSStefano Zampini       ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
5804eb5378fSStefano Zampini       mp[cp]->product->api_user = product->api_user;
5814eb5378fSStefano Zampini       ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
5824eb5378fSStefano Zampini       if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
5834eb5378fSStefano Zampini       ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
5844eb5378fSStefano Zampini       rmapt[cp] = 2;
5854eb5378fSStefano Zampini       rmapa[cp] = globidx;
5864eb5378fSStefano Zampini       cmapt[cp] = 2;
5874eb5378fSStefano Zampini       cmapa[cp] = P_oth_idx;
5884eb5378fSStefano Zampini       mptmp[cp] = PETSC_FALSE;
5894eb5378fSStefano Zampini       cp++;
5904eb5378fSStefano Zampini     }
5914eb5378fSStefano Zampini     break;
5924eb5378fSStefano Zampini   default:
5934eb5378fSStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]);
5944eb5378fSStefano Zampini   }
5954eb5378fSStefano Zampini   ierr = PetscMalloc1(cp,&mmdata->mp);CHKERRQ(ierr);
5964eb5378fSStefano Zampini   for (i = 0; i < cp; i++) mmdata->mp[i] = mp[i];
5974eb5378fSStefano Zampini   ierr = PetscMalloc1(cp,&mmdata->mptmp);CHKERRQ(ierr);
5984eb5378fSStefano Zampini   for (i = 0; i < cp; i++) mmdata->mptmp[i] = mptmp[i];
5994eb5378fSStefano Zampini   mmdata->cp = cp;
6004eb5378fSStefano Zampini   C->product->data       = mmdata;
6014eb5378fSStefano Zampini   C->product->destroy    = MatDestroy_MatMatMPIAIJCUSPARSE;
6024eb5378fSStefano Zampini   C->ops->productnumeric = MatProductNumeric_MPIAIJCUSPARSE_MPIAIJCUSPARSE;
6034eb5378fSStefano Zampini 
6044eb5378fSStefano Zampini   /* prepare coo coordinates for values insertion */
6054eb5378fSStefano Zampini   ncoo = 0;
6064eb5378fSStefano Zampini   for (cp = 0; cp < mmdata->cp; cp++) {
6074eb5378fSStefano Zampini     Mat_SeqAIJ *mm = (Mat_SeqAIJ*)mp[cp]->data;
6084eb5378fSStefano Zampini     if (mptmp[cp]) continue;
6094eb5378fSStefano Zampini     ncoo += mm->nz;
6104eb5378fSStefano Zampini   }
6114eb5378fSStefano Zampini   ierr = PetscMalloc2(ncoo,&coo_i,ncoo,&coo_j);CHKERRQ(ierr);
6124eb5378fSStefano Zampini   ncoo = 0;
6134eb5378fSStefano Zampini   for (cp = 0; cp < mmdata->cp; cp++) {
6144eb5378fSStefano Zampini     Mat_SeqAIJ     *mm = (Mat_SeqAIJ*)mp[cp]->data;
6154eb5378fSStefano Zampini     PetscInt       *coi = coo_i + ncoo;
6164eb5378fSStefano Zampini     PetscInt       *coj = coo_j + ncoo;
6174eb5378fSStefano Zampini     const PetscInt mr = mp[cp]->rmap->n;
6184eb5378fSStefano Zampini     const PetscInt *jj  = mm->j;
6194eb5378fSStefano Zampini     const PetscInt *ii  = mm->i;
6204eb5378fSStefano Zampini 
6214eb5378fSStefano Zampini     if (mptmp[cp]) continue;
6224eb5378fSStefano Zampini     /* rows coo */
6234eb5378fSStefano Zampini     if (!rmapt[cp]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"This should not happen");
6244eb5378fSStefano Zampini     else if (rmapt[cp] == 1) { /* local to global for owned rows of  */
6254eb5378fSStefano Zampini       const PetscInt rs = C->rmap->rstart;
6264eb5378fSStefano Zampini       for (i = 0; i < mr; i++) {
6274eb5378fSStefano Zampini         const PetscInt gr = i + rs;
6284eb5378fSStefano Zampini         for (j = ii[i]; j < ii[i+1]; j++) {
6294eb5378fSStefano Zampini           coi[j] = gr;
6304eb5378fSStefano Zampini         }
6314eb5378fSStefano Zampini       }
6324eb5378fSStefano Zampini     } else { /* offprocess */
6334eb5378fSStefano Zampini       const PetscInt *rmap = rmapa[cp];
6344eb5378fSStefano Zampini       for (i = 0; i < mr; i++) {
6354eb5378fSStefano Zampini         const PetscInt gr = rmap[i];
6364eb5378fSStefano Zampini         for (j = ii[i]; j < ii[i+1]; j++) {
6374eb5378fSStefano Zampini           coi[j] = gr;
6384eb5378fSStefano Zampini         }
6394eb5378fSStefano Zampini       }
6404eb5378fSStefano Zampini     }
6414eb5378fSStefano Zampini     /* columns coo */
6424eb5378fSStefano Zampini     if (!cmapt[cp]) {
6434eb5378fSStefano Zampini       ierr = PetscArraycpy(coj,jj,mm->nz);CHKERRQ(ierr);
6444eb5378fSStefano Zampini     } else if (cmapt[cp] == 1) { /* local to global for owned columns of P */
6454eb5378fSStefano Zampini       const PetscInt cs = P->cmap->rstart;
6464eb5378fSStefano Zampini       for (j = 0; j < mm->nz; j++) {
6474eb5378fSStefano Zampini         coj[j] = jj[j] + cs;
6484eb5378fSStefano Zampini       }
6494eb5378fSStefano Zampini     } else { /* offdiag */
6504eb5378fSStefano Zampini       const PetscInt *cmap = cmapa[cp];
6514eb5378fSStefano Zampini       for (j = 0; j < mm->nz; j++) {
6524eb5378fSStefano Zampini         coj[j] = cmap[jj[j]];
6534eb5378fSStefano Zampini       }
6544eb5378fSStefano Zampini     }
6554eb5378fSStefano Zampini     ncoo += mm->nz;
6564eb5378fSStefano Zampini   }
6574eb5378fSStefano Zampini   if (glob) {
6584eb5378fSStefano Zampini     ierr = ISRestoreIndices(glob,&globidx);CHKERRQ(ierr);
6594eb5378fSStefano Zampini   }
6604eb5378fSStefano Zampini   ierr = ISDestroy(&glob);CHKERRQ(ierr);
6614eb5378fSStefano Zampini   if (P_oth_l2g) {
6624eb5378fSStefano Zampini     ierr = ISLocalToGlobalMappingRestoreIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr);
6634eb5378fSStefano Zampini   }
6644eb5378fSStefano Zampini   ierr = ISLocalToGlobalMappingDestroy(&P_oth_l2g);CHKERRQ(ierr);
6654eb5378fSStefano Zampini 
6664eb5378fSStefano Zampini   if (hasoffproc) { /* offproc values insertion */
6674eb5378fSStefano Zampini     const PetscInt *sfdeg;
6684eb5378fSStefano Zampini     const PetscInt n = P->cmap->n;
6694eb5378fSStefano Zampini     PetscInt ncoo2,*coo_i2,*coo_j2;
6704eb5378fSStefano Zampini 
6714eb5378fSStefano Zampini     ierr = PetscSFCreate(PetscObjectComm((PetscObject)C),&mmdata->sf);CHKERRQ(ierr);
6724eb5378fSStefano Zampini     ierr = PetscSFSetGraphLayout(mmdata->sf,P->cmap,ncoo,NULL,PETSC_OWN_POINTER,coo_i);CHKERRQ(ierr);
6734eb5378fSStefano Zampini     ierr = PetscSFComputeDegreeBegin(mmdata->sf,&sfdeg);CHKERRQ(ierr);
6744eb5378fSStefano Zampini     ierr = PetscSFComputeDegreeEnd(mmdata->sf,&sfdeg);CHKERRQ(ierr);
6754eb5378fSStefano Zampini     for (i = 0, ncoo2 = 0; i < n; i++) ncoo2 += sfdeg[i];
6764eb5378fSStefano Zampini     ierr = PetscMalloc2(ncoo2,&coo_i2,ncoo2,&coo_j2);CHKERRQ(ierr);
6774eb5378fSStefano Zampini     ierr = PetscSFGatherBegin(mmdata->sf,MPIU_INT,coo_i,coo_i2);CHKERRQ(ierr);
6784eb5378fSStefano Zampini     ierr = PetscSFGatherEnd(mmdata->sf,MPIU_INT,coo_i,coo_i2);CHKERRQ(ierr);
6794eb5378fSStefano Zampini     ierr = PetscSFGatherBegin(mmdata->sf,MPIU_INT,coo_j,coo_j2);CHKERRQ(ierr);
6804eb5378fSStefano Zampini     ierr = PetscSFGatherEnd(mmdata->sf,MPIU_INT,coo_j,coo_j2);CHKERRQ(ierr);
6814eb5378fSStefano Zampini     ierr = PetscFree2(coo_i,coo_j);CHKERRQ(ierr);
6824eb5378fSStefano Zampini     ierr = PetscMalloc1(ncoo,&mmdata->coo_w);CHKERRQ(ierr);
6834eb5378fSStefano Zampini     coo_i = coo_i2;
6844eb5378fSStefano Zampini     coo_j = coo_j2;
6854eb5378fSStefano Zampini     ncoo  = ncoo2;
6864eb5378fSStefano Zampini   }
6874eb5378fSStefano Zampini 
6884eb5378fSStefano Zampini   /* preallocate with COO data */
6894eb5378fSStefano Zampini   ierr = MatSetPreallocationCOO(C,ncoo,coo_i,coo_j);CHKERRQ(ierr);
6904eb5378fSStefano Zampini   ierr = PetscMalloc1(ncoo,&mmdata->coo_v);CHKERRQ(ierr);
6914eb5378fSStefano Zampini   ierr = PetscFree2(coo_i,coo_j);CHKERRQ(ierr);
6924eb5378fSStefano Zampini   PetscFunctionReturn(0);
6934eb5378fSStefano Zampini }
6944eb5378fSStefano Zampini 
6954eb5378fSStefano Zampini static PetscErrorCode MatProductSetFromOptions_MPIAIJCUSPARSE(Mat mat)
6964eb5378fSStefano Zampini {
6974eb5378fSStefano Zampini   Mat_Product    *product = mat->product;
6984eb5378fSStefano Zampini   PetscErrorCode ierr;
6994eb5378fSStefano Zampini   PetscBool      Biscusp = PETSC_FALSE;
7004eb5378fSStefano Zampini 
7014eb5378fSStefano Zampini   PetscFunctionBegin;
7024eb5378fSStefano Zampini   MatCheckProduct(mat,1);
7034eb5378fSStefano Zampini   if (!product->B->boundtocpu) {
7044eb5378fSStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->B,MATMPIAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
7054eb5378fSStefano Zampini   }
7064eb5378fSStefano Zampini   if (Biscusp) {
7074eb5378fSStefano Zampini     switch (product->type) {
7084eb5378fSStefano Zampini     case MATPRODUCT_AB:
7094eb5378fSStefano Zampini     case MATPRODUCT_AtB:
7104eb5378fSStefano Zampini     case MATPRODUCT_PtAP:
7114eb5378fSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_MPIAIJCUSPARSE_MPIAIJCUSPARSE;
7124eb5378fSStefano Zampini       break;
7134eb5378fSStefano Zampini     default:
7144eb5378fSStefano Zampini       break;
7154eb5378fSStefano Zampini     }
7164eb5378fSStefano Zampini   }
7174eb5378fSStefano Zampini   if (!mat->ops->productsymbolic) {
7184eb5378fSStefano Zampini     ierr = MatProductSetFromOptions_MPIAIJ(mat);CHKERRQ(ierr);
7194eb5378fSStefano Zampini   }
7204eb5378fSStefano Zampini   PetscFunctionReturn(0);
7214eb5378fSStefano Zampini }
7224eb5378fSStefano Zampini 
723e589036eSStefano Zampini /*@
724e589036eSStefano Zampini    MatAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose
725e589036eSStefano Zampini 
726e589036eSStefano Zampini    Not collective
727e589036eSStefano Zampini 
728e589036eSStefano Zampini    Input Parameters:
729e589036eSStefano Zampini +  A - Matrix of type SEQAIJCUSPARSE or MPIAIJCUSPARSE
730e589036eSStefano Zampini -  gen - the boolean flag
731e589036eSStefano Zampini 
732e589036eSStefano Zampini    Level: intermediate
733e589036eSStefano Zampini 
734e589036eSStefano Zampini .seealso: MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE
735e589036eSStefano Zampini @*/
736e589036eSStefano Zampini PetscErrorCode  MatAIJCUSPARSESetGenerateTranspose(Mat A, PetscBool gen)
737e589036eSStefano Zampini {
738e589036eSStefano Zampini   PetscErrorCode ierr;
739e589036eSStefano Zampini   PetscBool      ismpiaij;
740e589036eSStefano Zampini 
741e589036eSStefano Zampini   PetscFunctionBegin;
742e589036eSStefano Zampini   PetscValidHeaderSpecific(A,MAT_CLASSID,1);
743e589036eSStefano Zampini   MatCheckPreallocated(A,1);
744e589036eSStefano Zampini   ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATMPIAIJ,&ismpiaij);CHKERRQ(ierr);
745e589036eSStefano Zampini   if (ismpiaij) {
746e589036eSStefano Zampini     Mat A_d,A_o;
747e589036eSStefano Zampini 
748e589036eSStefano Zampini     ierr = MatMPIAIJGetSeqAIJ(A,&A_d,&A_o,NULL);CHKERRQ(ierr);
749e589036eSStefano Zampini     ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A_d,gen);CHKERRQ(ierr);
750e589036eSStefano Zampini     ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A_o,gen);CHKERRQ(ierr);
751e589036eSStefano Zampini   } else {
752e589036eSStefano Zampini     ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,gen);CHKERRQ(ierr);
753e589036eSStefano Zampini   }
754e589036eSStefano Zampini   PetscFunctionReturn(0);
755e589036eSStefano Zampini }
756e589036eSStefano Zampini 
7579ae82921SPaul Mullowney PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
7589ae82921SPaul Mullowney {
7599ae82921SPaul Mullowney   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
7609ae82921SPaul Mullowney   PetscErrorCode ierr;
7619ae82921SPaul Mullowney   PetscInt       nt;
7629ae82921SPaul Mullowney 
7639ae82921SPaul Mullowney   PetscFunctionBegin;
7649ae82921SPaul Mullowney   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
7659ae82921SPaul Mullowney   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
7669ae82921SPaul Mullowney   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
7674d55d066SJunchao Zhang   ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
7689ae82921SPaul Mullowney   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
7699ae82921SPaul Mullowney   ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
7709ae82921SPaul Mullowney   PetscFunctionReturn(0);
7719ae82921SPaul Mullowney }
772ca45077fSPaul Mullowney 
7733fa6b06aSMark Adams PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
7743fa6b06aSMark Adams {
7753fa6b06aSMark Adams   Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
7763fa6b06aSMark Adams   PetscErrorCode ierr;
7773fa6b06aSMark Adams 
7783fa6b06aSMark Adams   PetscFunctionBegin;
7793fa6b06aSMark Adams   if (A->factortype == MAT_FACTOR_NONE) {
7803fa6b06aSMark Adams     Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
7813fa6b06aSMark Adams     PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
7823fa6b06aSMark Adams     if (d_mat) {
7833fa6b06aSMark Adams       Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
7843fa6b06aSMark Adams       Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
7853fa6b06aSMark Adams       PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
7863fa6b06aSMark Adams       cudaError_t  err;
7873fa6b06aSMark Adams       PetscScalar  *vals;
7883fa6b06aSMark Adams       ierr = PetscInfo(A,"Zero device matrix diag and offfdiag\n");CHKERRQ(ierr);
7893fa6b06aSMark Adams       err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
7903fa6b06aSMark Adams       err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
7913fa6b06aSMark Adams       err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
7923fa6b06aSMark Adams       err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
7933fa6b06aSMark Adams     }
7943fa6b06aSMark Adams   }
7953fa6b06aSMark Adams   ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
7963fa6b06aSMark Adams   ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
7973fa6b06aSMark Adams 
7983fa6b06aSMark Adams   PetscFunctionReturn(0);
7993fa6b06aSMark Adams }
800fdc842d1SBarry Smith PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
801fdc842d1SBarry Smith {
802fdc842d1SBarry Smith   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
803fdc842d1SBarry Smith   PetscErrorCode ierr;
804fdc842d1SBarry Smith   PetscInt       nt;
805fdc842d1SBarry Smith 
806fdc842d1SBarry Smith   PetscFunctionBegin;
807fdc842d1SBarry Smith   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
808fdc842d1SBarry Smith   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
809fdc842d1SBarry Smith   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
8104d55d066SJunchao Zhang   ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
811fdc842d1SBarry Smith   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
812fdc842d1SBarry Smith   ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
813fdc842d1SBarry Smith   PetscFunctionReturn(0);
814fdc842d1SBarry Smith }
815fdc842d1SBarry Smith 
816ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
817ca45077fSPaul Mullowney {
818ca45077fSPaul Mullowney   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
819ca45077fSPaul Mullowney   PetscErrorCode ierr;
820ca45077fSPaul Mullowney   PetscInt       nt;
821ca45077fSPaul Mullowney 
822ca45077fSPaul Mullowney   PetscFunctionBegin;
823ca45077fSPaul Mullowney   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
824ccf5f80bSJunchao Zhang   if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
8259b2db222SKarl Rupp   ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
826ca45077fSPaul Mullowney   ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
8279b2db222SKarl Rupp   ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
8289b2db222SKarl Rupp   ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
829ca45077fSPaul Mullowney   PetscFunctionReturn(0);
830ca45077fSPaul Mullowney }
8319ae82921SPaul Mullowney 
832e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
833ca45077fSPaul Mullowney {
834ca45077fSPaul Mullowney   Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
835bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
836e057df02SPaul Mullowney 
837ca45077fSPaul Mullowney   PetscFunctionBegin;
838e057df02SPaul Mullowney   switch (op) {
839e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT_DIAG:
840e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat = format;
841045c96e1SPaul Mullowney     break;
842e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT_OFFDIAG:
843e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = format;
844045c96e1SPaul Mullowney     break;
845e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
846e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat    = format;
847e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = format;
848045c96e1SPaul Mullowney     break;
849e057df02SPaul Mullowney   default:
850e057df02SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op);
851045c96e1SPaul Mullowney   }
852ca45077fSPaul Mullowney   PetscFunctionReturn(0);
853ca45077fSPaul Mullowney }
854e057df02SPaul Mullowney 
8554416b707SBarry Smith PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
856e057df02SPaul Mullowney {
857e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
858e057df02SPaul Mullowney   PetscErrorCode           ierr;
859e057df02SPaul Mullowney   PetscBool                flg;
860a183c035SDominic Meiser   Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
861a183c035SDominic Meiser   Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
8625fd66863SKarl Rupp 
863e057df02SPaul Mullowney   PetscFunctionBegin;
864e55864a3SBarry Smith   ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
865e057df02SPaul Mullowney   if (A->factortype==MAT_FACTOR_NONE) {
866e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
867a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
868e057df02SPaul Mullowney     if (flg) {
869e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
870e057df02SPaul Mullowney     }
871e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
872a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
873e057df02SPaul Mullowney     if (flg) {
874e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
875e057df02SPaul Mullowney     }
876e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
877a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
878e057df02SPaul Mullowney     if (flg) {
879e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
880e057df02SPaul Mullowney     }
881e057df02SPaul Mullowney   }
8820af67c1bSStefano Zampini   ierr = PetscOptionsTail();CHKERRQ(ierr);
883e057df02SPaul Mullowney   PetscFunctionReturn(0);
884e057df02SPaul Mullowney }
885e057df02SPaul Mullowney 
88634d6c7a5SJose E. Roman PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
88734d6c7a5SJose E. Roman {
88834d6c7a5SJose E. Roman   PetscErrorCode             ierr;
8893fa6b06aSMark Adams   Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
8903fa6b06aSMark Adams   Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
8913fa6b06aSMark Adams   PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
89234d6c7a5SJose E. Roman   PetscFunctionBegin;
89334d6c7a5SJose E. Roman   ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
89434d6c7a5SJose E. Roman   if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
89534d6c7a5SJose E. Roman     ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
89634d6c7a5SJose E. Roman   }
897a587d139SMark   if (d_mat) {
8983fa6b06aSMark Adams     A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
8993fa6b06aSMark Adams   }
9003fa6b06aSMark Adams 
90134d6c7a5SJose E. Roman   PetscFunctionReturn(0);
90234d6c7a5SJose E. Roman }
90334d6c7a5SJose E. Roman 
904bbf3fe20SPaul Mullowney PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
905bbf3fe20SPaul Mullowney {
906bbf3fe20SPaul Mullowney   PetscErrorCode     ierr;
9073fa6b06aSMark Adams   Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
9083fa6b06aSMark Adams   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
909b06137fdSPaul Mullowney   cudaError_t        err;
910b06137fdSPaul Mullowney   cusparseStatus_t   stat;
911bbf3fe20SPaul Mullowney 
912bbf3fe20SPaul Mullowney   PetscFunctionBegin;
913d98d7c49SStefano Zampini   if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
9143fa6b06aSMark Adams   if (cusparseStruct->deviceMat) {
9153fa6b06aSMark Adams     Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
9163fa6b06aSMark Adams     Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
9173fa6b06aSMark Adams     PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
9183fa6b06aSMark Adams     ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
9193fa6b06aSMark Adams     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
9203fa6b06aSMark Adams     if (jaca->compressedrow.use) {
9213fa6b06aSMark Adams       err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
9223fa6b06aSMark Adams     }
9233fa6b06aSMark Adams     if (jacb->compressedrow.use) {
9243fa6b06aSMark Adams       err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
9253fa6b06aSMark Adams     }
9263fa6b06aSMark Adams     err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
9273fa6b06aSMark Adams     err = cudaFree(d_mat);CHKERRCUDA(err);
9283fa6b06aSMark Adams   }
929bbf3fe20SPaul Mullowney   try {
9303fa6b06aSMark Adams     if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
9313fa6b06aSMark Adams     if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
93257d48284SJunchao Zhang     stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
93317403302SKarl Rupp     if (cusparseStruct->stream) {
934c41cb2e2SAlejandro Lamas Daviña       err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
93517403302SKarl Rupp     }
9367e8381f9SStefano Zampini     delete cusparseStruct->coo_p;
9377e8381f9SStefano Zampini     delete cusparseStruct->coo_pw;
938bbf3fe20SPaul Mullowney     delete cusparseStruct;
939bbf3fe20SPaul Mullowney   } catch(char *ex) {
940bbf3fe20SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
941bbf3fe20SPaul Mullowney   }
9423338378cSStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
943ed502f03SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
9447e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
9457e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
9463338378cSStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
947bbf3fe20SPaul Mullowney   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
948bbf3fe20SPaul Mullowney   PetscFunctionReturn(0);
949bbf3fe20SPaul Mullowney }
950ca45077fSPaul Mullowney 
9513338378cSStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
9529ae82921SPaul Mullowney {
9539ae82921SPaul Mullowney   PetscErrorCode     ierr;
954bbf3fe20SPaul Mullowney   Mat_MPIAIJ         *a;
955bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE *cusparseStruct;
956b06137fdSPaul Mullowney   cusparseStatus_t   stat;
9573338378cSStefano Zampini   Mat                A;
9589ae82921SPaul Mullowney 
9599ae82921SPaul Mullowney   PetscFunctionBegin;
9603338378cSStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
9613338378cSStefano Zampini     ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
9623338378cSStefano Zampini   } else if (reuse == MAT_REUSE_MATRIX) {
9633338378cSStefano Zampini     ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
9643338378cSStefano Zampini   }
9653338378cSStefano Zampini   A = *newmat;
9663338378cSStefano Zampini 
96734136279SStefano Zampini   ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
96834136279SStefano Zampini   ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);
96934136279SStefano Zampini 
970bbf3fe20SPaul Mullowney   a = (Mat_MPIAIJ*)A->data;
971d98d7c49SStefano Zampini   if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
972d98d7c49SStefano Zampini   if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
973d98d7c49SStefano Zampini   if (a->lvec) {
974d98d7c49SStefano Zampini     ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
975d98d7c49SStefano Zampini   }
976d98d7c49SStefano Zampini 
9773338378cSStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
978bbf3fe20SPaul Mullowney     a->spptr = new Mat_MPIAIJCUSPARSE;
9792205254eSKarl Rupp 
980bbf3fe20SPaul Mullowney     cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
981e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
982e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
9837e8381f9SStefano Zampini     cusparseStruct->coo_p               = NULL;
9847e8381f9SStefano Zampini     cusparseStruct->coo_pw              = NULL;
98517403302SKarl Rupp     cusparseStruct->stream              = 0;
98657d48284SJunchao Zhang     stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
9873fa6b06aSMark Adams     cusparseStruct->deviceMat = NULL;
9883338378cSStefano Zampini   }
9892205254eSKarl Rupp 
99034d6c7a5SJose E. Roman   A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
991bbf3fe20SPaul Mullowney   A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
992fdc842d1SBarry Smith   A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
993bbf3fe20SPaul Mullowney   A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
994bbf3fe20SPaul Mullowney   A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
995bbf3fe20SPaul Mullowney   A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
9963fa6b06aSMark Adams   A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
9974eb5378fSStefano Zampini   A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJCUSPARSE;
9982205254eSKarl Rupp 
999bbf3fe20SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
1000ed502f03SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
10013338378cSStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
1002bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
10037e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
10047e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
10059ae82921SPaul Mullowney   PetscFunctionReturn(0);
10069ae82921SPaul Mullowney }
10079ae82921SPaul Mullowney 
10083338378cSStefano Zampini PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
10093338378cSStefano Zampini {
10103338378cSStefano Zampini   PetscErrorCode ierr;
10113338378cSStefano Zampini 
10123338378cSStefano Zampini   PetscFunctionBegin;
101305035670SJunchao Zhang   ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
10143338378cSStefano Zampini   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
10153338378cSStefano Zampini   ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
10163338378cSStefano Zampini   PetscFunctionReturn(0);
10173338378cSStefano Zampini }
10183338378cSStefano Zampini 
10193f9c0db1SPaul Mullowney /*@
10203f9c0db1SPaul Mullowney    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
1021e057df02SPaul Mullowney    (the default parallel PETSc format).  This matrix will ultimately pushed down
10223f9c0db1SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
1023e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
1024e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
1025e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
10269ae82921SPaul Mullowney 
1027d083f849SBarry Smith    Collective
1028e057df02SPaul Mullowney 
1029e057df02SPaul Mullowney    Input Parameters:
1030e057df02SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
1031e057df02SPaul Mullowney .  m - number of rows
1032e057df02SPaul Mullowney .  n - number of columns
1033e057df02SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
1034e057df02SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
10350298fd71SBarry Smith          (possibly different for each row) or NULL
1036e057df02SPaul Mullowney 
1037e057df02SPaul Mullowney    Output Parameter:
1038e057df02SPaul Mullowney .  A - the matrix
1039e057df02SPaul Mullowney 
1040e057df02SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
1041e057df02SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
1042e057df02SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
1043e057df02SPaul Mullowney 
1044e057df02SPaul Mullowney    Notes:
1045e057df02SPaul Mullowney    If nnz is given then nz is ignored
1046e057df02SPaul Mullowney 
1047e057df02SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
1048e057df02SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
1049e057df02SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
1050e057df02SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
1051e057df02SPaul Mullowney 
1052e057df02SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
10530298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1054e057df02SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
1055e057df02SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
1056e057df02SPaul Mullowney 
1057e057df02SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
1058e057df02SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
1059e057df02SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
1060e057df02SPaul Mullowney    reusing matrix information to achieve increased efficiency.
1061e057df02SPaul Mullowney 
1062e057df02SPaul Mullowney    Level: intermediate
1063e057df02SPaul Mullowney 
1064e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
1065e057df02SPaul Mullowney @*/
10669ae82921SPaul Mullowney PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
10679ae82921SPaul Mullowney {
10689ae82921SPaul Mullowney   PetscErrorCode ierr;
10699ae82921SPaul Mullowney   PetscMPIInt    size;
10709ae82921SPaul Mullowney 
10719ae82921SPaul Mullowney   PetscFunctionBegin;
10729ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
10739ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
1074ffc4695bSBarry Smith   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
10759ae82921SPaul Mullowney   if (size > 1) {
10769ae82921SPaul Mullowney     ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
10779ae82921SPaul Mullowney     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
10789ae82921SPaul Mullowney   } else {
10799ae82921SPaul Mullowney     ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
10809ae82921SPaul Mullowney     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
10819ae82921SPaul Mullowney   }
10829ae82921SPaul Mullowney   PetscFunctionReturn(0);
10839ae82921SPaul Mullowney }
10849ae82921SPaul Mullowney 
10853ca39a21SBarry Smith /*MC
1086e057df02SPaul Mullowney    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
1087e057df02SPaul Mullowney 
10882692e278SPaul Mullowney    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
10892692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
10902692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
10919ae82921SPaul Mullowney 
10929ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
10939ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
10949ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
10959ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
10969ae82921SPaul Mullowney    the above preallocation routines for simplicity.
10979ae82921SPaul Mullowney 
10989ae82921SPaul Mullowney    Options Database Keys:
1099e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
11008468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
11018468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
11028468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
11039ae82921SPaul Mullowney 
11049ae82921SPaul Mullowney   Level: beginner
11059ae82921SPaul Mullowney 
11068468deeeSKarl Rupp  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
11078468deeeSKarl Rupp M
11089ae82921SPaul Mullowney M*/
11093fa6b06aSMark Adams 
11103fa6b06aSMark Adams // get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
11113fa6b06aSMark Adams PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
11123fa6b06aSMark Adams {
11139db3cbf9SStefano Zampini #if defined(PETSC_USE_CTABLE)
11149db3cbf9SStefano Zampini   SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
11159db3cbf9SStefano Zampini #else
11163fa6b06aSMark Adams   PetscSplitCSRDataStructure **p_d_mat;
11173fa6b06aSMark Adams   PetscMPIInt                size,rank;
11183fa6b06aSMark Adams   MPI_Comm                   comm;
11193fa6b06aSMark Adams   PetscErrorCode             ierr;
11203fa6b06aSMark Adams   int                        *ai,*bi,*aj,*bj;
11213fa6b06aSMark Adams   PetscScalar                *aa,*ba;
11223fa6b06aSMark Adams 
11233fa6b06aSMark Adams   PetscFunctionBegin;
11243fa6b06aSMark Adams   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1125ffc4695bSBarry Smith   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
1126ffc4695bSBarry Smith   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
11273fa6b06aSMark Adams   if (A->factortype == MAT_FACTOR_NONE) {
11283fa6b06aSMark Adams     CsrMatrix *matrixA,*matrixB=NULL;
11293fa6b06aSMark Adams     if (size == 1) {
11303fa6b06aSMark Adams       Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
11313fa6b06aSMark Adams       p_d_mat = &cusparsestruct->deviceMat;
11323fa6b06aSMark Adams       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
11333fa6b06aSMark Adams       if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
11343fa6b06aSMark Adams         matrixA = (CsrMatrix*)matstruct->mat;
11353fa6b06aSMark Adams         bi = bj = NULL; ba = NULL;
11366b78a27eSPierre Jolivet       } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
11373fa6b06aSMark Adams     } else {
11383fa6b06aSMark Adams       Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
11393fa6b06aSMark Adams       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
11403fa6b06aSMark Adams       p_d_mat = &spptr->deviceMat;
11413fa6b06aSMark Adams       Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
11423fa6b06aSMark Adams       Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
11433fa6b06aSMark Adams       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
11443fa6b06aSMark Adams       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
11453fa6b06aSMark Adams       if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
11463fa6b06aSMark Adams         if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
11473fa6b06aSMark Adams         matrixA = (CsrMatrix*)matstructA->mat;
11483fa6b06aSMark Adams         matrixB = (CsrMatrix*)matstructB->mat;
11493fa6b06aSMark Adams         bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
11503fa6b06aSMark Adams         bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
11513fa6b06aSMark Adams         ba = thrust::raw_pointer_cast(matrixB->values->data());
11526b78a27eSPierre Jolivet       } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
11533fa6b06aSMark Adams     }
11543fa6b06aSMark Adams     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
11553fa6b06aSMark Adams     aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
11563fa6b06aSMark Adams     aa = thrust::raw_pointer_cast(matrixA->values->data());
11573fa6b06aSMark Adams   } else {
11583fa6b06aSMark Adams     *B = NULL;
11593fa6b06aSMark Adams     PetscFunctionReturn(0);
11603fa6b06aSMark Adams   }
11613fa6b06aSMark Adams   // act like MatSetValues because not called on host
11623fa6b06aSMark Adams   if (A->assembled) {
11633fa6b06aSMark Adams     if (A->was_assembled) {
11643fa6b06aSMark Adams       ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
11653fa6b06aSMark Adams     }
11663fa6b06aSMark Adams     A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
11673fa6b06aSMark Adams   } else {
11683fa6b06aSMark Adams     SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
11693fa6b06aSMark Adams   }
11703fa6b06aSMark Adams   if (!*p_d_mat) {
11713fa6b06aSMark Adams     cudaError_t                 err;
11723fa6b06aSMark Adams     PetscSplitCSRDataStructure  *d_mat, h_mat;
11733fa6b06aSMark Adams     Mat_SeqAIJ                  *jaca;
11743fa6b06aSMark Adams     PetscInt                    n = A->rmap->n, nnz;
11753fa6b06aSMark Adams     // create and copy
11763fa6b06aSMark Adams     ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
11773fa6b06aSMark Adams     err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
11783fa6b06aSMark Adams     err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
11793fa6b06aSMark Adams     *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
11803fa6b06aSMark Adams     if (size == 1) {
11813fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)A->data;
11823fa6b06aSMark Adams       h_mat.rstart = 0; h_mat.rend = A->rmap->n;
11833fa6b06aSMark Adams       h_mat.cstart = 0; h_mat.cend = A->cmap->n;
11843fa6b06aSMark Adams       h_mat.offdiag.i = h_mat.offdiag.j = NULL;
11853fa6b06aSMark Adams       h_mat.offdiag.a = NULL;
11863fa6b06aSMark Adams     } else {
11873fa6b06aSMark Adams       Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
11883fa6b06aSMark Adams       Mat_SeqAIJ  *jacb;
11893fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)aij->A->data;
11903fa6b06aSMark Adams       jacb = (Mat_SeqAIJ*)aij->B->data;
11913fa6b06aSMark Adams       if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
11923fa6b06aSMark Adams       if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
11933fa6b06aSMark Adams       // create colmap - this is ussually done (lazy) in MatSetValues
11943fa6b06aSMark Adams       aij->donotstash = PETSC_TRUE;
11953fa6b06aSMark Adams       aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
11963fa6b06aSMark Adams       jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly
11973fa6b06aSMark Adams       ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
11983fa6b06aSMark Adams       aij->colmap[A->cmap->N] = -9;
11993fa6b06aSMark Adams       ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
12003fa6b06aSMark Adams       {
12013fa6b06aSMark Adams         PetscInt ii;
12023fa6b06aSMark Adams         for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
12033fa6b06aSMark Adams       }
12043fa6b06aSMark Adams       if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
12053fa6b06aSMark Adams       // allocate B copy data
12063fa6b06aSMark Adams       h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
12073fa6b06aSMark Adams       h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
12083fa6b06aSMark Adams       nnz = jacb->i[n];
12093fa6b06aSMark Adams 
12103fa6b06aSMark Adams       if (jacb->compressedrow.use) {
12113fa6b06aSMark Adams         err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
12123fa6b06aSMark Adams         err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
12136b78a27eSPierre Jolivet       } else h_mat.offdiag.i = bi;
12143fa6b06aSMark Adams       h_mat.offdiag.j = bj;
12153fa6b06aSMark Adams       h_mat.offdiag.a = ba;
12163fa6b06aSMark Adams 
12173fa6b06aSMark Adams       err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
12183fa6b06aSMark Adams       err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
12193fa6b06aSMark Adams       h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
12203fa6b06aSMark Adams       h_mat.offdiag.n = n;
12213fa6b06aSMark Adams     }
12223fa6b06aSMark Adams     // allocate A copy data
12233fa6b06aSMark Adams     nnz = jaca->i[n];
12243fa6b06aSMark Adams     h_mat.diag.n = n;
12253fa6b06aSMark Adams     h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
1226ffc4695bSBarry Smith     ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
12273fa6b06aSMark Adams     if (jaca->compressedrow.use) {
12283fa6b06aSMark Adams       err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
12293fa6b06aSMark Adams       err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
12303fa6b06aSMark Adams     } else {
12313fa6b06aSMark Adams       h_mat.diag.i = ai;
12323fa6b06aSMark Adams     }
12333fa6b06aSMark Adams     h_mat.diag.j = aj;
12343fa6b06aSMark Adams     h_mat.diag.a = aa;
12353fa6b06aSMark Adams     // copy pointers and metdata to device
12363fa6b06aSMark Adams     err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
12373fa6b06aSMark Adams     ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
12383fa6b06aSMark Adams   } else {
12393fa6b06aSMark Adams     *B = *p_d_mat;
12403fa6b06aSMark Adams   }
12413fa6b06aSMark Adams   A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
12423fa6b06aSMark Adams   PetscFunctionReturn(0);
12439db3cbf9SStefano Zampini #endif
12443fa6b06aSMark Adams }
1245