#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

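/* Copies the first component of a zipped tuple into the second; used below with
   permutation iterators to scatter permuted COO values on the device */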
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

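/* Predicate for thrust::partition below: true when the column index (tuple component 1)
   lies inside the diagonal block [cstart,cend) */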
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

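/* Predicate for thrust::find_if below: true when a column index falls outside the
   diagonal block [cstart,cend) */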
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

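/* Maps a global row/column index to a local one by subtracting the ownership start */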
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               N,*jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
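  /* Partition the COO entries so diagonal-block entries come first and off-diagonal ones
     last, recording the permutation in coo_p so MatSetValuesCOO can reorder v[] to match */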
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj);CHKERRQ(ierr);
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected IS size %" PetscInt_FMT " != %" PetscInt_FMT " COO size",N,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode         ierr;
  Mat                    newmat;
  Mat_MPIAIJ             *mpiaij;
  Mat_MPIAIJCUSPARSE     *mpidev;
  PetscInt               coo_basic = 1;
  PetscMemType           mtype = PETSC_MEMTYPE_DEVICE;
  PetscInt               rstart,rend;
  cudaError_t            cerr;

  PetscFunctionBegin;
  if (coo_i) {
    ierr = PetscLayoutGetRange(mat->rmap,&rstart,&rend);CHKERRQ(ierr);
    ierr = PetscGetMemType(coo_i,&mtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = 0; break;}
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  ierr = MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_INT,MPI_LAND,PetscObjectComm((PetscObject)mat));CHKERRMPI(ierr);

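  /* coo_basic == 1 means every rank saw only nonnegative, locally owned indices (or had
     device-resident indices it could not inspect), so the fast Basic path suffices;
     otherwise fall back to the general MPIAIJ COO machinery and mirror its maps on the GPU */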
  if (coo_basic) {
    ierr = MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
  } else {
    mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
    ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr);
    ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(newmat,MATMPIAIJ);CHKERRQ(ierr);
    ierr = MatSetOption(newmat,MAT_IGNORE_OFF_PROC_ENTRIES,mpiaij->donotstash);CHKERRQ(ierr); /* Inherit the two options that we respect from mat */
    ierr = MatSetOption(newmat,MAT_NO_OFF_PROC_ENTRIES,mat->nooffprocentries);CHKERRQ(ierr);
    ierr = MatSetPreallocationCOO_MPIAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
    ierr = MatConvert(newmat,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
    ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
    mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); /* mat->data was changed in MatHeaderMerge() */
    mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(mpiaij->A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(mpiaij->B);CHKERRQ(ierr);
    ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */
    mpidev->use_extended_coo = PETSC_TRUE;

    cerr = cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar));CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}

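/* Grid-stride pack kernel: gathers the COO values destined for other ranks into a
   contiguous send buffer, buf[i] = kv[perm[i]] */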
__global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
{
  PetscCount        i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount  grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) buf[i] = kv[perm[i]];
}

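/* Grid-stride add kernel: for each nonzero i, sums the jmap[i+1]-jmap[i] repeated COO
   contributions kv[perm[k]] into the CSR value array entry a[imap[i]] */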
__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[])
{
  PetscCount        i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount  grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) {for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) a[imap[i]] += kv[perm[k]];}
}

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
{
  PetscErrorCode                 ierr;
  cudaError_t                    cerr;
  Mat_MPIAIJ                     *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
  Mat_MPIAIJCUSPARSE             *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
  Mat                            A = mpiaij->A,B = mpiaij->B;
  PetscCount                     Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
  PetscScalar                    *Aa,*Ba;
  PetscScalar                    *vsend = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
  const PetscScalar              *v1 = v;
  const PetscCount               *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d;
  const PetscCount               *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d;
  const PetscCount               *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
  const PetscCount               *Cperm1 = mpidev->Cperm1_d;
  PetscMemType                   memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      cerr = cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    }

    if (imode == INSERT_VALUES) {
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa);CHKERRQ(ierr); /* write matrix values */
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba);CHKERRQ(ierr);
      cerr = cudaMemset(Aa,0,((Mat_SeqAIJ*)A->data)->nz*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMemset(Ba,0,((Mat_SeqAIJ*)B->data)->nz*sizeof(PetscScalar));CHKERRCUDA(cerr);
    } else {
      ierr = MatSeqAIJCUSPARSEGetArray(A,&Aa);CHKERRQ(ierr); /* read & write matrix values */
      ierr = MatSeqAIJCUSPARSEGetArray(B,&Ba);CHKERRQ(ierr);
    }

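    /* Kernel launches below use 256-thread blocks with grid-stride loops; each launch is
       guarded since a zero-sized grid is an invalid CUDA launch configuration */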
    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);

    /* Send remote entries to their owner and overlap the communication with local computation */
    ierr = PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE);CHKERRQ(ierr);
    /* Add local entries to A and B */
    if (Annz1) MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa);
    if (Bnnz1) MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba);
    ierr = PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE);CHKERRQ(ierr);

    /* Add received remote entries to A and B */
    if (Annz2) MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa);
    if (Bnnz2) MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);

    if (imode == INSERT_VALUES) {
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba);CHKERRQ(ierr);
    } else {
      ierr = MatSeqAIJCUSPARSERestoreArray(A,&Aa);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSERestoreArray(B,&Ba);CHKERRQ(ierr);
    }
    if (PetscMemTypeHost(memtype)) {cerr = cudaFree((void*)v1);CHKERRCUDA(cerr);}
  } else {
    ierr = MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  /* If A is bound to the CPU, the local vector used in the matrix multiplies should also be bound to the CPU. */
  if (A->boundtocpu) {ierr = VecBindToCPU(a->lvec,PETSC_TRUE);CHKERRQ(ierr);}
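  /* Overlap the halo exchange with local work: begin the scatter of xx into lvec, do the
     diagonal-block multiply, end the scatter, then add the off-diagonal contribution */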
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz = A->nonzerostate;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  PetscCheckFalse(!cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    /* Extended COO */
    if (cusparseStruct->use_extended_coo) {
      cerr = cudaFree(cusparseStruct->Aimap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Ajmap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bimap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bjmap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aimap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Ajmap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aperm2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bimap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bjmap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bperm2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Cperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->sendbuf_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->recvbuf_d);CHKERRCUDA(cerr);
    } else {
      delete cusparseStruct->coo_p;
      delete cusparseStruct->coo_pw;
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    Mat_MPIAIJCUSPARSE *cusp = new Mat_MPIAIJCUSPARSE;
    stat     = cusparseCreate(&(cusp->handle));CHKERRCUSPARSE(stat);
    a->spptr = cusp;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

7323f9c0db1SPaul Mullowney /*@
7333f9c0db1SPaul Mullowney    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
734e057df02SPaul Mullowney    (the default parallel PETSc format).  This matrix will ultimately pushed down
7353f9c0db1SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
736e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
737e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
738e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
7399ae82921SPaul Mullowney 
740d083f849SBarry Smith    Collective
741e057df02SPaul Mullowney 
742e057df02SPaul Mullowney    Input Parameters:
743e057df02SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
744e057df02SPaul Mullowney .  m - number of rows
745e057df02SPaul Mullowney .  n - number of columns
746e057df02SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
747e057df02SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
7480298fd71SBarry Smith          (possibly different for each row) or NULL
749e057df02SPaul Mullowney 
750e057df02SPaul Mullowney    Output Parameter:
751e057df02SPaul Mullowney .  A - the matrix
752e057df02SPaul Mullowney 
753e057df02SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
754e057df02SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
755e057df02SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
756e057df02SPaul Mullowney 
757e057df02SPaul Mullowney    Notes:
758e057df02SPaul Mullowney    If nnz is given then nz is ignored
759e057df02SPaul Mullowney 
760e057df02SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
761e057df02SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
762e057df02SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
763e057df02SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
764e057df02SPaul Mullowney 
   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.
769e057df02SPaul Mullowney 
770e057df02SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
771e057df02SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
772e057df02SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
773e057df02SPaul Mullowney    reusing matrix information to achieve increased efficiency.
774e057df02SPaul Mullowney 
775e057df02SPaul Mullowney    Level: intermediate
776e057df02SPaul Mullowney 
.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
778e057df02SPaul Mullowney @*/
7799ae82921SPaul Mullowney PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
7809ae82921SPaul Mullowney {
7819ae82921SPaul Mullowney   PetscErrorCode ierr;
7829ae82921SPaul Mullowney   PetscMPIInt    size;
7839ae82921SPaul Mullowney 
7849ae82921SPaul Mullowney   PetscFunctionBegin;
7859ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
7869ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
787ffc4695bSBarry Smith   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
7889ae82921SPaul Mullowney   if (size > 1) {
7899ae82921SPaul Mullowney     ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
7909ae82921SPaul Mullowney     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
7919ae82921SPaul Mullowney   } else {
7929ae82921SPaul Mullowney     ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
7939ae82921SPaul Mullowney     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
7949ae82921SPaul Mullowney   }
7959ae82921SPaul Mullowney   PetscFunctionReturn(0);
7969ae82921SPaul Mullowney }
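
/* Editor's sketch (assumption, not part of the original source): direct use of
   MatCreateAIJCUSPARSE() with rough preallocation for a 1D Laplacian-like stencil:
   at most 3 nonzeros per row in the diagonal block and 2 in the off-diagonal block. */
static PetscErrorCode ExampleCreateAIJCUSPARSEDirect(MPI_Comm comm,PetscInt mlocal,Mat *A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatCreateAIJCUSPARSE(comm,mlocal,mlocal,PETSC_DETERMINE,PETSC_DETERMINE,3,NULL,2,NULL,A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}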
7979ae82921SPaul Mullowney 
7983ca39a21SBarry Smith /*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.
800e057df02SPaul Mullowney 
   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.
8049ae82921SPaul Mullowney 
8059ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
8069ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
8079ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
8089ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
8099ae82921SPaul Mullowney    the above preallocation routines for simplicity.
8109ae82921SPaul Mullowney 
8119ae82921SPaul Mullowney    Options Database Keys:
812e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
8138468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
8148468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
8158468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
8169ae82921SPaul Mullowney 
8179ae82921SPaul Mullowney   Level: beginner
8189ae82921SPaul Mullowney 
8196bb69460SJunchao Zhang  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
8206bb69460SJunchao Zhang M*/
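
/* Editor's sketch (assumption, not part of the original source): selecting the
   cuSPARSE storage format programmatically with MatCUSPARSESetFormat(), the API
   counterpart of the -mat_cusparse_storage_format option listed above. */
static PetscErrorCode ExampleSetCUSPARSEFormat(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  /* use the HYB format for both the diagonal and off-diagonal multiply */
  ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_HYB);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}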
8216bb69460SJunchao Zhang 
8226bb69460SJunchao Zhang /*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.
8246bb69460SJunchao Zhang 
8256bb69460SJunchao Zhang   Level: beginner
8266bb69460SJunchao Zhang 
8276bb69460SJunchao Zhang  .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
8289ae82921SPaul Mullowney M*/
8293fa6b06aSMark Adams 
// Get device (GPU) pointers to a stripped-down split-CSR view of the Mat; works for both seq and MPI Mat
831042217e8SBarry Smith PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
8323fa6b06aSMark Adams {
833042217e8SBarry Smith   PetscSplitCSRDataStructure d_mat;
834042217e8SBarry Smith   PetscMPIInt                size;
8353fa6b06aSMark Adams   PetscErrorCode             ierr;
836042217e8SBarry Smith   int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
837042217e8SBarry Smith   PetscScalar                *aa = NULL,*ba = NULL;
83899551766SMark Adams   Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
839042217e8SBarry Smith   Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
840042217e8SBarry Smith   CsrMatrix                  *matrixA = NULL,*matrixB = NULL;
8413fa6b06aSMark Adams 
8423fa6b06aSMark Adams   PetscFunctionBegin;
8432c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
844042217e8SBarry Smith   if (A->factortype != MAT_FACTOR_NONE) {
8453fa6b06aSMark Adams     *B = NULL;
8463fa6b06aSMark Adams     PetscFunctionReturn(0);
8473fa6b06aSMark Adams   }
848042217e8SBarry Smith   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  // get jaca, the host CSR data of the local diagonal block (the whole matrix when size == 1)
8503fa6b06aSMark Adams   if (size == 1) {
851042217e8SBarry Smith     PetscBool isseqaij;
852042217e8SBarry Smith 
853042217e8SBarry Smith     ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
854042217e8SBarry Smith     if (isseqaij) {
8553fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)A->data;
8562c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
857042217e8SBarry Smith       cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
858042217e8SBarry Smith       d_mat = cusparsestructA->deviceMat;
859042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
8603fa6b06aSMark Adams     } else {
8613fa6b06aSMark Adams       Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
8622c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
863042217e8SBarry Smith       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
8643fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)aij->A->data;
865042217e8SBarry Smith       cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
866042217e8SBarry Smith       d_mat = spptr->deviceMat;
867042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
8683fa6b06aSMark Adams     }
869042217e8SBarry Smith     if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
870042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
8712c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
872042217e8SBarry Smith       matrixA = (CsrMatrix*)matstruct->mat;
873042217e8SBarry Smith       bi = NULL;
874042217e8SBarry Smith       bj = NULL;
875042217e8SBarry Smith       ba = NULL;
876042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
877042217e8SBarry Smith   } else {
878042217e8SBarry Smith     Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
8792c71b3e2SJacob Faibussowitsch     PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
880042217e8SBarry Smith     jaca = (Mat_SeqAIJ*)aij->A->data;
88199551766SMark Adams     jacb = (Mat_SeqAIJ*)aij->B->data;
882042217e8SBarry Smith     Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
8833fa6b06aSMark Adams 
8842c71b3e2SJacob Faibussowitsch     PetscCheckFalse(!A->nooffprocentries && !aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
885042217e8SBarry Smith     cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
886042217e8SBarry Smith     Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
8872c71b3e2SJacob Faibussowitsch     PetscCheckFalse(cusparsestructA->format!=MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
888042217e8SBarry Smith     if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
889042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
890042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
891042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
892042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
8932c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
8942c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
895042217e8SBarry Smith       matrixA = (CsrMatrix*)matstructA->mat;
896042217e8SBarry Smith       matrixB = (CsrMatrix*)matstructB->mat;
8973fa6b06aSMark Adams       if (jacb->compressedrow.use) {
898042217e8SBarry Smith         if (!cusparsestructB->rowoffsets_gpu) {
899042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
900042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
9013fa6b06aSMark Adams         }
902042217e8SBarry Smith         bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
903042217e8SBarry Smith       } else {
904042217e8SBarry Smith         bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
905042217e8SBarry Smith       }
906042217e8SBarry Smith       bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
907042217e8SBarry Smith       ba = thrust::raw_pointer_cast(matrixB->values->data());
908042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
909042217e8SBarry Smith     d_mat = spptr->deviceMat;
910042217e8SBarry Smith   }
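  /* Editor's note: when compressed-row storage is in use, the CSR kept on the GPU
     only covers the nonempty rows, so the full host row offsets (length rmap->n+1)
     are uploaded once and cached in rowoffsets_gpu. */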
9113fa6b06aSMark Adams   if (jaca->compressedrow.use) {
912042217e8SBarry Smith     if (!cusparsestructA->rowoffsets_gpu) {
913042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
914042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
9153fa6b06aSMark Adams     }
916042217e8SBarry Smith     ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
9173fa6b06aSMark Adams   } else {
918042217e8SBarry Smith     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
9193fa6b06aSMark Adams   }
920042217e8SBarry Smith   aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
921042217e8SBarry Smith   aa = thrust::raw_pointer_cast(matrixA->values->data());
922042217e8SBarry Smith 
923042217e8SBarry Smith   if (!d_mat) {
924042217e8SBarry Smith     cudaError_t                cerr;
925042217e8SBarry Smith     PetscSplitCSRDataStructure h_mat;
926042217e8SBarry Smith 
    // create and populate the struct on the host, then copy it to the device
928042217e8SBarry Smith     ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
929042217e8SBarry Smith     ierr = PetscNew(&h_mat);CHKERRQ(ierr);
930042217e8SBarry Smith     cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
931042217e8SBarry Smith     if (size > 1) { /* need the colmap array */
932042217e8SBarry Smith       Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
93399551766SMark Adams       PetscInt   *colmap;
934042217e8SBarry Smith       PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;
935042217e8SBarry Smith 
9362c71b3e2SJacob Faibussowitsch       PetscCheckFalse(n && !aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
937042217e8SBarry Smith 
938042217e8SBarry Smith       ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
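      /* editor's note: colmap maps a global column index to 1 + its local position in
         the off-diagonal block B; entries left at 0 by PetscCalloc1 mark columns that
         do not occur in B */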
939042217e8SBarry Smith       for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
940365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
      { // cuSPARSE keeps 32-bit indices; make 64-bit (PetscInt) device copies of the off-diagonal row offsets and column indices
94299551766SMark Adams         int        *h_bi32, *h_bj32;
94399551766SMark Adams         PetscInt   *h_bi64, *h_bj64, *d_bi64, *d_bj64;
944365b711fSMark Adams         ierr = PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);CHKERRQ(ierr);
94599551766SMark Adams         cerr = cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
94699551766SMark Adams         for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
94799551766SMark Adams         cerr = cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
94899551766SMark Adams         for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];
94999551766SMark Adams 
95099551766SMark Adams         cerr = cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));CHKERRCUDA(cerr);
95199551766SMark Adams         cerr = cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
95299551766SMark Adams         cerr = cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));CHKERRCUDA(cerr);
95399551766SMark Adams         cerr = cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
95499551766SMark Adams 
95599551766SMark Adams         h_mat->offdiag.i = d_bi64;
95699551766SMark Adams         h_mat->offdiag.j = d_bj64;
95799551766SMark Adams         h_mat->allocated_indices = PETSC_TRUE;
95899551766SMark Adams 
959365b711fSMark Adams         ierr = PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);CHKERRQ(ierr);
960365b711fSMark Adams       }
961365b711fSMark Adams #else
96299551766SMark Adams       h_mat->offdiag.i = (PetscInt*)bi;
96399551766SMark Adams       h_mat->offdiag.j = (PetscInt*)bj;
96499551766SMark Adams       h_mat->allocated_indices = PETSC_FALSE;
965365b711fSMark Adams #endif
966042217e8SBarry Smith       h_mat->offdiag.a = ba;
967042217e8SBarry Smith       h_mat->offdiag.n = A->rmap->n;
968042217e8SBarry Smith 
96999551766SMark Adams       cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));CHKERRCUDA(cerr);
97099551766SMark Adams       cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
971042217e8SBarry Smith       ierr = PetscFree(colmap);CHKERRQ(ierr);
972042217e8SBarry Smith     }
973042217e8SBarry Smith     h_mat->rstart = A->rmap->rstart;
974042217e8SBarry Smith     h_mat->rend   = A->rmap->rend;
975042217e8SBarry Smith     h_mat->cstart = A->cmap->rstart;
976042217e8SBarry Smith     h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N; /* global number of columns */
978365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
979365b711fSMark Adams     {
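      /* editor's note: as for the off-diagonal block above, widen the 32-bit cuSPARSE
         row offsets and column indices of the diagonal block to 64-bit PetscInt device
         arrays via host staging buffers */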
      PetscCheckFalse(sizeof(PetscInt) != 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"size of PetscInt = %zu",sizeof(PetscInt));
98199551766SMark Adams       int        *h_ai32, *h_aj32;
98299551766SMark Adams       PetscInt   *h_ai64, *h_aj64, *d_ai64, *d_aj64;
983365b711fSMark Adams       ierr = PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);CHKERRQ(ierr);
98499551766SMark Adams       cerr = cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
98599551766SMark Adams       for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
98699551766SMark Adams       cerr = cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
98799551766SMark Adams       for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];
98899551766SMark Adams 
98999551766SMark Adams       cerr = cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));CHKERRCUDA(cerr);
99099551766SMark Adams       cerr = cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
99199551766SMark Adams       cerr = cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));CHKERRCUDA(cerr);
99299551766SMark Adams       cerr = cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
99399551766SMark Adams 
99499551766SMark Adams       h_mat->diag.i = d_ai64;
99599551766SMark Adams       h_mat->diag.j = d_aj64;
99699551766SMark Adams       h_mat->allocated_indices = PETSC_TRUE;
99799551766SMark Adams 
998365b711fSMark Adams       ierr = PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);CHKERRQ(ierr);
999365b711fSMark Adams     }
1000365b711fSMark Adams #else
100199551766SMark Adams     h_mat->diag.i = (PetscInt*)ai;
100299551766SMark Adams     h_mat->diag.j = (PetscInt*)aj;
100399551766SMark Adams     h_mat->allocated_indices = PETSC_FALSE;
1004365b711fSMark Adams #endif
1005042217e8SBarry Smith     h_mat->diag.a = aa;
1006042217e8SBarry Smith     h_mat->diag.n = A->rmap->n;
1007042217e8SBarry Smith     h_mat->rank   = PetscGlobalRank;
1008042217e8SBarry Smith     // copy pointers and metadata to device
1009042217e8SBarry Smith     cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1010042217e8SBarry Smith     ierr = PetscFree(h_mat);CHKERRQ(ierr);
1011042217e8SBarry Smith   } else {
1012042217e8SBarry Smith     ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
1013042217e8SBarry Smith   }
1014042217e8SBarry Smith   *B = d_mat;
1015042217e8SBarry Smith   A->offloadmask = PETSC_OFFLOAD_GPU;
10163fa6b06aSMark Adams   PetscFunctionReturn(0);
10173fa6b06aSMark Adams }
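
/* Editor's sketch (assumption, not part of the original source): device code that
   consumes the split-CSR view produced above. ScaleLocalDiag is a hypothetical
   kernel, not a PETSc API; the field names (diag.i, diag.j, diag.a, diag.n) follow
   the host code in MatCUSPARSEGetDeviceMatWrite(). */
static __global__ void ScaleLocalDiag(PetscSplitCSRDataStructure d_mat,PetscScalar alpha)
{
  const PetscInt row = blockIdx.x*blockDim.x + threadIdx.x;
  if (row < d_mat->diag.n) { /* one thread per local row */
    for (PetscInt k = d_mat->diag.i[row]; k < d_mat->diag.i[row+1]; k++) d_mat->diag.a[k] *= alpha;
  }
}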
1018