#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>

struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (cusp->coo_p && v) {
    THRUSTARRAY *w;
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
    w = new THRUSTARRAY(n);
    w->assign(v,v+n);
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(w->begin(),cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(w->begin(),cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}

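/* Usage sketch for the COO assembly path above (a hedged illustration with
   made-up sizes and values; assumes the public MatSetPreallocationCOO() and
   MatSetValuesCOO() entry points dispatch to the _MPIAIJCUSPARSE
   implementations in this file; error checking elided):

     Mat         A;
     PetscInt    coo_i[] = {0,0,1}, coo_j[] = {0,1,1};   // global indices
     PetscScalar v[]     = {1.0,2.0,3.0};

     MatCreate(PETSC_COMM_WORLD,&A);
     MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,2,2);
     MatSetType(A,MATMPIAIJCUSPARSE);
     MatSetPreallocationCOO(A,3,coo_i,coo_j);  // split diag/off-diag entries on the GPU
     MatSetValuesCOO(A,v,INSERT_VALUES);       // permute and scatter the values with thrust
*/

/* Helper functors for MatSetPreallocationCOO_MPIAIJCUSPARSE below: IsOffDiag
   selects entries whose global column index falls outside the local column
   ownership range [cstart,cend); IsNotOffDiagT is its tuple-based complement,
   used with thrust::partition so that diagonal-block entries come first; and
   GlobToLoc shifts global indices to process-local ones. */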
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n);
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));

  /* copy the off-diagonal column indices to the CPU to build the local-to-global map */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected number of mapped indices %D != %D COO size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory; the cusparse-specific preallocation routines handle the allocation internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

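/* Merge the diagonal and off-diagonal blocks of the parallel matrix into a single
   sequential matrix on the GPU; if requested, also return the global column index
   of each column of the merged matrix (owned columns first, then the off-process
   columns listed in garray). */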
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  if (b->lvec) {
    ierr = VecSetType(b->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

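/* Matrix-vector product, overlapping communication with computation: start
   scattering the ghost values needed by the off-diagonal block, multiply by the
   diagonal block while the scatter is in flight, then complete the scatter and
   add in the off-diagonal contribution. MatMultAdd_MPIAIJCUSPARSE below follows
   the same pattern. */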
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t  err;
      PetscScalar  *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;

  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat           = NULL;
  }

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE)
.  n - number of local columns (or PETSC_DECIDE)
.  M - number of global rows (or PETSC_DETERMINE)
.  N - number of global columns (or PETSC_DETERMINE)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
          (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
          (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either the nz or the nnz arguments (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
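
/* A hedged usage sketch (illustrative sizes; the preallocation numbers are
   placeholders): create a 100x100 parallel AIJCUSPARSE matrix with at most 5
   nonzeros per row in the diagonal block and 2 per row in the off-diagonal block:

     Mat A;
     MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,5,NULL,2,NULL,&A);
*/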

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

// get a GPU pointer to a stripped-down Mat; works for both Seq and MPI Mat
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;
  PetscScalar                *aa,*ba;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because this is not called on the host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assembled more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done lazily in MatAssemble, which we no longer call; it is also done in the AIJ AssemblyEnd -- is it needed here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assembled matrix");
  }
  if (!*p_d_mat) {
    cudaError_t                 err;
    PetscSplitCSRDataStructure  *d_mat, h_mat;
    Mat_SeqAIJ                  *jaca;
    PetscInt                    n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
    } else {
      Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ  *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap -- this is usually done lazily in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9;
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      if (jacb->compressedrow.use) {
        err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi;
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready for writing with MatSetValues; this is done lazily in the normal MatSetValues
  PetscFunctionReturn(0);
#endif
}
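
/* A hedged usage sketch of the device-assembly path (cf. src/mat/tutorials/ex5cu.cu;
   assumes a user kernel calling MatSetValuesDevice() from petscaijdevice.h on the
   returned PetscSplitCSRDataStructure; launch configuration and error checks elided):

     PetscSplitCSRDataStructure *d_mat;
     MatCUSPARSEGetDeviceMatWrite(A,&d_mat);            // marks A unassembled for writing
     assemble_kernel<<<nblocks,nthreads>>>(d_mat,...);  // hypothetical kernel using MatSetValuesDevice()
     MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);            // finalize assembly as usual
     MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);
*/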