xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 219fbbafcab96f1b7f27d5f571f31c0ed2202c25)
1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
299acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
30fd2b57fSKarl Rupp 
43d13b8fdSMatthew G. Knepley #include <petscconf.h>
59ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
657d48284SJunchao Zhang #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
73d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
87e8381f9SStefano Zampini #include <thrust/advance.h>
9a2cee5feSJed Brown #include <thrust/partition.h>
10a2cee5feSJed Brown #include <thrust/sort.h>
11a2cee5feSJed Brown #include <thrust/unique.h>
124eb5378fSStefano Zampini #include <petscsf.h>
137e8381f9SStefano Zampini 
/* Functor for thrust::for_each over a zipped iterator pair: assigns the
   first tuple element to the second, i.e. performs dst = src elementwise. */
struct VecCUDAEquals
{
  template <typename TupleT>
  __host__ __device__
  void operator()(TupleT zipped)
  {
    thrust::get<1>(zipped) = thrust::get<0>(zipped);
  }
};
237e8381f9SStefano Zampini 
/* Insert/add the COO values v[] (host or device pointer, or NULL) into the
   diagonal (a->A) and off-diagonal (a->B) sequential blocks, using the layout
   computed earlier by MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic():
   cusp->coo_nd diagonal entries come first, then cusp->coo_no off-diagonal ones.
   If a permutation (cusp->coo_p) was built at preallocation time, v[] is still
   in the caller's original COO ordering and is permuted into cusp->coo_pw on
   the device before being handed to the Seq blocks. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no; /* total number of COO entries on this rank */
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) { /* values must be permuted into (diag,offdiag) order first */
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else { /* host values: stage them in a temporary device array */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* zip (d_v[coo_p[k]], coo_pw[k]) so VecCUDAEquals stores the permuted value: coo_pw[k] = d_v[coo_p[k]] */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w; /* safe when w == NULL (device-memory input path) */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else { /* no permutation needed (or v == NULL): v is already split as [diag | offdiag] */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  /* mark the parallel matrix assembled with values living on the GPU */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
667e8381f9SStefano Zampini 
/* Predicate for thrust::partition over zipped (row, col, perm) tuples:
   true when the column index (tuple element 1) lies inside the local
   diagonal block [_cstart, _cend). */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    const PetscInt col = thrust::get<1>(t);
    return (_cstart <= col) && (col < _cend);
  }
};
797e8381f9SStefano Zampini 
/* Predicate: true when a column index falls outside the local diagonal
   block [_cstart, _cend), i.e. the entry belongs to the off-diagonal part. */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &col)
  {
    return !((_cstart <= col) && (col < _cend));
  }
};
917e8381f9SStefano Zampini 
/* Unary functor mapping a global index to a local one by subtracting the
   start of the ownership range. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &gidx)
  {
    return gidx - _start;
  }
};
1037e8381f9SStefano Zampini 
/* Basic (local-only) COO preallocation: every (coo_i[k], coo_j[k]) is assumed
   to be owned by this rank. Partitions the entries on the GPU into diagonal
   and off-diagonal parts (recording a permutation in cusp->coo_p when the
   input was interleaved), builds the compact off-diagonal column map
   b->garray, and (re)creates the sequential A and B blocks with their own
   COO preallocation. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               N,*jj;
  size_t                 noff = 0; /* number of distinct off-diagonal (global) columns */
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  /* throw away any previous block structure; it is rebuilt below */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* locate the first off-diagonal entry, and the first diagonal entry after it;
     if both exist the input interleaves diag/offdiag and must be partitioned */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);  /* permutation from user ordering to partitioned ordering */
    cusp->coo_pw = new THRUSTARRAY(n);    /* work array for permuted values at MatSetValuesCOO time */
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    /* stable layout: move all diagonal-block entries (and their permutation indices) to the front */
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* #entries in the diagonal block */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* #entries in the off-diagonal block */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  /* b->garray maps local off-diagonal column -> global column */
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* compact the global offdiag column ids in jj[] into local ids via a hash-based global-to-local mapping */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj);CHKERRQ(ierr);
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* recreate the sequential diagonal (A) and off-diagonal (B, noff columns) blocks */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
2039ae82921SPaul Mullowney 
/* COO preallocation dispatcher: if every rank's (coo_i, coo_j) contains only
   nonnegative, locally-owned row indices (the "basic" case), delegate to the
   fast device-side MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(). Otherwise
   build the extended-COO machinery on the host via MatSetPreallocationCOO_MPIAIJ
   and mirror its index/permutation arrays onto the device for use by
   MatSetValuesCOO_MPIAIJCUSPARSE(). */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode         ierr;
  Mat                    newmat;
  Mat_MPIAIJ             *mpiaij;
  Mat_MPIAIJCUSPARSE     *mpidev;
  PetscInt               coo_basic = 1; /* PetscInt (not bool) so it can go through MPI_Allreduce */
  PetscMemType           mtype = PETSC_MEMTYPE_DEVICE;
  PetscInt               rstart,rend;
  cudaError_t            cerr;

  PetscFunctionBegin;
  if (coo_i) {
    ierr = PetscLayoutGetRange(mat->rmap,&rstart,&rend);CHKERRQ(ierr);
    ierr = PetscGetMemType(coo_i,&mtype);CHKERRQ(ierr);
    /* the scan is only done for host indices; device-resident indices keep coo_basic = 1 */
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = 0; break;}
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  ierr = MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_INT,MPI_LAND,PetscObjectComm((PetscObject)mat));CHKERRMPI(ierr);

  if (coo_basic) {
    ierr = MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
  } else {
    /* build a plain MPIAIJ with the extended COO support, then convert it to
       CUSPARSE and fold it into mat in place */
    mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
    ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr);
    ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(newmat,MATMPIAIJ);CHKERRQ(ierr);
    ierr = MatSetOption(newmat,MAT_IGNORE_OFF_PROC_ENTRIES,mpiaij->donotstash);CHKERRQ(ierr); /* Inherit the two options that we respect from mat */
    ierr = MatSetOption(newmat,MAT_NO_OFF_PROC_ENTRIES,mat->nooffprocentries);CHKERRQ(ierr);
    ierr = MatSetPreallocationCOO_MPIAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
    ierr = MatConvert(newmat,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
    ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
    ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */
    mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); /* mat->data was changed by MatHeaderMerge() above, so refetch */
    mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    /* mirror the host-side COO maps (built by MatSetPreallocationCOO_MPIAIJ) onto the device:
       *imap*/*jmap*/*perm* with suffix 1 describe local entries, suffix 2 describe received remote
       entries, for the diagonal (A) and off-diagonal (B) blocks respectively */
    cerr = cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    /* communication staging: Cperm1 packs local values destined for remote ranks */
    cerr = cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar));CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}
285*219fbbafSJunchao Zhang 
/* Gather kernel: buf[k] = kv[perm[k]] for k in [0,nnz).
   Uses a grid-stride loop, so any 1D launch configuration covers the range. */
__global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
{
  const PetscCount stride = gridDim.x*blockDim.x;
  PetscCount       first  = blockIdx.x*blockDim.x + threadIdx.x;

  for (PetscCount k = first; k < nnz; k += stride) buf[k] = kv[perm[k]];
}
292*219fbbafSJunchao Zhang 
/* Scatter-add kernel: for each nonzero i in [0,nnz), accumulate all COO values
   mapped to it — a[imap[i]] += sum of kv[perm[k]] for k in [jmap[i],jmap[i+1]).
   Uses a grid-stride loop, so any 1D launch configuration covers the range. */
__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[])
{
  const PetscCount stride = gridDim.x*blockDim.x;
  PetscCount       first  = blockIdx.x*blockDim.x + threadIdx.x;

  for (PetscCount i = first; i < nnz; i += stride) {
    PetscScalar sum = a[imap[i]];
    for (PetscCount k = jmap[i]; k < jmap[i+1]; k++) sum += kv[perm[k]];
    a[imap[i]] = sum;
  }
}
299*219fbbafSJunchao Zhang 
/* Set/add COO values for an MPIAIJCUSPARSE matrix. In the extended-COO path
   the values v[] (host or device, or NULL meaning zeros) are staged on the
   device, entries owned by other ranks are packed and communicated via the
   PetscSF built at preallocation time, and local plus received entries are
   accumulated into the A (diagonal) and B (off-diagonal) blocks with the
   device-resident maps. Otherwise fall back to the basic path.

   Fixes over the previous version: every kernel launch is guarded against a
   zero-sized range (a 0-block grid is an invalid launch configuration) and is
   followed by cudaPeekAtLastError() so launch failures are reported instead
   of being silently dropped. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
{
  PetscErrorCode                 ierr;
  cudaError_t                    cerr;
  Mat_MPIAIJ                     *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
  Mat_MPIAIJCUSPARSE             *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
  Mat                            A = mpiaij->A,B = mpiaij->B;
  PetscCount                     Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
  PetscScalar                    *Aa,*Ba;
  PetscScalar                    *vsend = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
  const PetscScalar              *v1 = v;
  const PetscCount               *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d;
  const PetscCount               *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d;
  const PetscCount               *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
  const PetscCount               *Cperm1 = mpidev->Cperm1_d;
  PetscMemType                   memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      cerr = cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (v) {cerr = cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);}
      else {cerr = cudaMemset((void*)v1,0,mpiaij->coo_n*sizeof(PetscScalar));CHKERRCUDA(cerr);}
    }

    if (imode == INSERT_VALUES) {
      ierr = MatZeroEntries(A);CHKERRQ(ierr); /* INSERT means overwrite: start from zero, then add */
      ierr = MatZeroEntries(B);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa);CHKERRQ(ierr); /* write matrix values */
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba);CHKERRQ(ierr);
    } else {
      ierr = MatSeqAIJCUSPARSEGetArray(A,&Aa);CHKERRQ(ierr); /* read & write matrix values */
      ierr = MatSeqAIJCUSPARSEGetArray(B,&Ba);CHKERRQ(ierr);
    }

    /* Pack entries to be sent to remote (skip empty launches: 0 blocks is invalid) */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);
      cerr = cudaPeekAtLastError();CHKERRCUDA(cerr);
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    ierr = PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE);CHKERRQ(ierr);
    /* Add local entries to A and B */
    if (Annz1) {
      MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa);
      cerr = cudaPeekAtLastError();CHKERRCUDA(cerr);
    }
    if (Bnnz1) {
      MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba);
      cerr = cudaPeekAtLastError();CHKERRCUDA(cerr);
    }
    ierr = PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE);CHKERRQ(ierr);

    /* Add received remote entries to A and B */
    if (Annz2) {
      MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa);
      cerr = cudaPeekAtLastError();CHKERRCUDA(cerr);
    }
    if (Bnnz2) {
      MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);
      cerr = cudaPeekAtLastError();CHKERRCUDA(cerr);
    }

    if (imode == INSERT_VALUES) {
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba);CHKERRQ(ierr);
    } else {
      ierr = MatSeqAIJCUSPARSERestoreArray(A,&Aa);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSERestoreArray(B,&Ba);CHKERRQ(ierr);
    }
    /* free the temporary device copy of host-provided values */
    if (PetscMemTypeHost(memtype)) {cerr = cudaFree((void*)v1);CHKERRCUDA(cerr);}
  } else {
    ierr = MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
363*219fbbafSJunchao Zhang 
/* Merge the diagonal and off-diagonal blocks of A into a single sequential
   matrix *A_loc; optionally return in *glob the global column index of each
   local column of the merged matrix (diagonal columns first, then the
   off-diagonal columns taken from the compact column map). */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cstart,k,nd,no,*gindices;

    ierr = MatGetLocalSize(Ad,NULL,&nd);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&no);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cstart,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(nd+no,&gindices);CHKERRQ(ierr);
    for (k=0; k<nd; k++) gindices[k]    = cstart + k; /* diagonal block: contiguous owned columns */
    for (k=0; k<no; k++) gindices[nd+k] = cmap[k];    /* off-diagonal block: mapped through cmap */
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),nd+no,gindices,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
386ed502f03SStefano Zampini 
/*
  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE - preallocation hook for MPIAIJCUSPARSE matrices.

  Sets up the row/column layouts, (re)creates the diagonal (b->A) and off-diagonal (b->B)
  sequential blocks as MATSEQAIJCUSPARSE, preallocates them, and propagates the stored
  cuSPARSE storage formats and handle to both blocks.

  Input Parameters:
+ B     - the MPIAIJCUSPARSE matrix being preallocated
. d_nz  - nonzeros per row of the diagonal block (ignored if d_nnz given)
. d_nnz - per-row nonzero counts for the diagonal block, or NULL
. o_nz  - nonzeros per row of the off-diagonal block (ignored if o_nnz given)
- o_nnz - per-row nonzero counts for the off-diagonal block, or NULL
*/
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* Debug-only validation: negative nonzero counts are user errors */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
  /* Discard any previously built column map, garray, local work vector and scatter;
     they are rebuilt at the next assembly */
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    /* On a single process there is no off-diagonal part, so give B zero columns */
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  /* Blocks inherit the parent's CPU binding state */
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  /* Both blocks share the parent's cuSPARSE handle */
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
446e057df02SPaul Mullowney 
/*
  MatMult_MPIAIJCUSPARSE - computes yy = A*xx for an MPIAIJCUSPARSE matrix.

  Overlaps communication with computation: the scatter that gathers the
  off-process entries of xx into the local work vector runs while the
  diagonal-block product is computed, after which the off-diagonal block
  contribution is accumulated.
*/
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nloc;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nloc);CHKERRQ(ierr);
  PetscCheckFalse(nloc != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nloc);
  /* If A is bound to the CPU, the local vector used in the matrix multiplies should also be bound to the CPU. */
  if (A->boundtocpu) {
    ierr = VecBindToCPU(mpiaij->lvec,PETSC_TRUE);CHKERRQ(ierr);
  }
  ierr = VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->A->ops->mult)(mpiaij->A,xx,yy);CHKERRQ(ierr);                    /* yy  = Ad*xx   */
  ierr = VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy);CHKERRQ(ierr);    /* yy += Ao*lvec */
  PetscFunctionReturn(0);
}
464ca45077fSPaul Mullowney 
/*
  MatZeroEntries_MPIAIJCUSPARSE - zeroes all stored entries of the matrix by
  zeroing both of its sequential blocks (diagonal and off-diagonal).
*/
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(mpiaij->A);CHKERRQ(ierr);   /* diagonal block */
  ierr = MatZeroEntries(mpiaij->B);CHKERRQ(ierr);   /* off-diagonal block */
  PetscFunctionReturn(0);
}
475042217e8SBarry Smith 
/*
  MatMultAdd_MPIAIJCUSPARSE - computes zz = A*xx + yy for an MPIAIJCUSPARSE matrix.

  Same overlap pattern as MatMult_MPIAIJCUSPARSE: scatter off-process entries of
  xx into a->lvec while the diagonal-block multadd runs, then accumulate the
  off-diagonal contribution.

  Fix: mirror MatMult_MPIAIJCUSPARSE and bind the local work vector to the CPU
  when A is bound to the CPU; previously this routine used the same lvec without
  the bind, inconsistently with MatMult_MPIAIJCUSPARSE.
*/
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  /* If A is bound to the CPU, the local vector used in the matrix multiplies should also be bound to the CPU (consistent with MatMult_MPIAIJCUSPARSE). */
  if (A->boundtocpu) {ierr = VecBindToCPU(a->lvec,PETSC_TRUE);CHKERRQ(ierr);}
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);      /* zz  = Ad*xx + yy */
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr); /* zz += Ao*lvec    */
  PetscFunctionReturn(0);
}
491fdc842d1SBarry Smith 
/*
  MatMultTranspose_MPIAIJCUSPARSE - computes yy = A^T*xx for an MPIAIJCUSPARSE matrix.

  The off-diagonal transpose product lands in the local work vector; its entries
  belong to off-process rows of the result and are therefore accumulated into yy
  with a reverse (ADD_VALUES) scatter.
*/
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mpiaij = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nloc;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nloc);CHKERRQ(ierr);
  PetscCheckFalse(nloc != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->rmap->n,nloc);
  ierr = (*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec);CHKERRQ(ierr); /* lvec = Ao^T*xx */
  ierr = (*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy);CHKERRQ(ierr);           /* yy   = Ad^T*xx */
  ierr = VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
5079ae82921SPaul Mullowney 
/*
  MatCUSPARSESetFormat_MPIAIJCUSPARSE - records the requested cuSPARSE storage
  format for the diagonal block, the off-diagonal block, or both.

  Input Parameters:
+ A      - the MPIAIJCUSPARSE matrix
. op     - which block(s) the format applies to
- format - the storage format to use

  Fix: the unsupported-operation error message previously listed
  MAT_CUSPARSE_MULT_DIAG twice and the nonexistent "MAT_CUSPARSE_MULT_ALL";
  it now names the three operations actually handled.
*/
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
530e057df02SPaul Mullowney 
/*
  MatSetFromOptions_MPIAIJCUSPARSE - processes the -mat_cusparse_* storage-format
  options for an unfactored MPIAIJCUSPARSE matrix and forwards each one that was
  set to MatCUSPARSESetFormat().
*/
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat fmt;
  PetscErrorCode           ierr;
  PetscBool                set;
  Mat_MPIAIJ               *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  /* Storage-format options only make sense for an unfactored matrix */
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) { ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,fmt);CHKERRQ(ierr); }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->offdiagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) { ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,fmt);CHKERRQ(ierr); }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) { ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr); }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
561e057df02SPaul Mullowney 
/*
  MatAssemblyEnd_MPIAIJCUSPARSE - finishes assembly via the base MPIAIJ routine,
  switches the local work vector to the CUDA type, and invalidates the cached
  device-side split-CSR structure if the nonzero pattern changed during assembly.
*/
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz = A->nonzerostate;  /* nonzero state before base assembly, to detect pattern changes */

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    /* Copy the device-resident struct to the host so the device pointers it
       holds can be freed individually */
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    /* Index arrays are only owned (and thus freed) when they were allocated for this struct */
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
59434d6c7a5SJose E. Roman 
/*
  MatDestroy_MPIAIJCUSPARSE - destructor for MPIAIJCUSPARSE matrices.

  Frees the cached device-side split-CSR structure (if any), clears the cuSPARSE
  handles of both sequential blocks, destroys the cuSPARSE handle, releases the
  COO assembly buffers, removes the composed methods, and finally delegates to
  the base MatDestroy_MPIAIJ.
*/
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  PetscCheckFalse(!cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* Copy the device-resident struct to the host so the device pointers it
       holds can be freed individually */
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    /* Index arrays are only owned (and thus freed) when they were allocated for this struct */
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  /* Thrust containers below may throw; translate to a PETSc error */
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    /* Extended COO */
    if (cusparseStruct->use_extended_coo) {
      cerr = cudaFree(cusparseStruct->Aimap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Ajmap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bimap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bjmap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aimap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Ajmap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aperm2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bimap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bjmap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bperm2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Cperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->sendbuf_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->recvbuf_d);CHKERRCUDA(cerr);
    } else {
      /* Basic COO path uses thrust containers instead of raw device buffers */
      delete cusparseStruct->coo_p;
      delete cusparseStruct->coo_pw;
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* Remove the methods composed in MatConvert_MPIAIJ_MPIAIJCUSPARSE */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
666ca45077fSPaul Mullowney 
/*
  MatConvert_MPIAIJ_MPIAIJCUSPARSE - converts an MPIAIJ matrix to MPIAIJCUSPARSE.

  For MAT_INITIAL_MATRIX the source is duplicated, for MAT_REUSE_MATRIX values
  are copied into the existing target; otherwise (MAT_INPLACE_MATRIX) *newmat is
  the source itself. The sequential blocks are retyped to MATSEQAIJCUSPARSE, the
  GPU-specific operations are installed, and the type-specific methods composed.
*/
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;  /* for MAT_INPLACE_MATRIX this is B itself */
  A->boundtocpu = PETSC_FALSE;
  /* Vectors created for this matrix default to the CUDA type */
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  /* Allocate the GPU-side bookkeeping struct and cuSPARSE handle once */
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    Mat_MPIAIJCUSPARSE *cusp = new Mat_MPIAIJCUSPARSE;
    stat     = cusparseCreate(&(cusp->handle));CHKERRCUSPARSE(stat);
    a->spptr = cusp;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  /* Composed methods are removed again in MatDestroy_MPIAIJCUSPARSE */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
7199ae82921SPaul Mullowney 
/*
  MatCreate_MPIAIJCUSPARSE - type constructor registered for MATMPIAIJCUSPARSE.
  Builds a plain MPIAIJ matrix and converts it in place to the CUSPARSE type.
*/
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode err;

  PetscFunctionBegin;
  /* Make sure the CUDA device is initialized before any GPU data is created */
  err = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(err);
  err = MatCreate_MPIAIJ(A);CHKERRQ(err);
  err = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(err);
  PetscFunctionReturn(0);
}
7303338378cSStefano Zampini 
7313f9c0db1SPaul Mullowney /*@
7323f9c0db1SPaul Mullowney    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
7343f9c0db1SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
735e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
736e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
737e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
7389ae82921SPaul Mullowney 
739d083f849SBarry Smith    Collective
740e057df02SPaul Mullowney 
741e057df02SPaul Mullowney    Input Parameters:
742e057df02SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
743e057df02SPaul Mullowney .  m - number of rows
744e057df02SPaul Mullowney .  n - number of columns
745e057df02SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
746e057df02SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
7470298fd71SBarry Smith          (possibly different for each row) or NULL
748e057df02SPaul Mullowney 
749e057df02SPaul Mullowney    Output Parameter:
750e057df02SPaul Mullowney .  A - the matrix
751e057df02SPaul Mullowney 
752e057df02SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
753e057df02SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
754e057df02SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
755e057df02SPaul Mullowney 
756e057df02SPaul Mullowney    Notes:
757e057df02SPaul Mullowney    If nnz is given then nz is ignored
758e057df02SPaul Mullowney 
759e057df02SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
760e057df02SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
761e057df02SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
762e057df02SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
763e057df02SPaul Mullowney 
764e057df02SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
7650298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
766e057df02SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
767e057df02SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
768e057df02SPaul Mullowney 
769e057df02SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
770e057df02SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
771e057df02SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
772e057df02SPaul Mullowney    reusing matrix information to achieve increased efficiency.
773e057df02SPaul Mullowney 
774e057df02SPaul Mullowney    Level: intermediate
775e057df02SPaul Mullowney 
776e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
777e057df02SPaul Mullowney @*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    nproc;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&nproc);CHKERRMPI(ierr);
  /* Pick the sequential or parallel CUSPARSE type depending on the communicator size */
  if (nproc == 1) {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
7969ae82921SPaul Mullowney 
7973ca39a21SBarry Smith /*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.
799e057df02SPaul Mullowney 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
8012692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
8022692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
8039ae82921SPaul Mullowney 
8049ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
8059ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
8069ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
8079ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
8089ae82921SPaul Mullowney    the above preallocation routines for simplicity.
8099ae82921SPaul Mullowney 
8109ae82921SPaul Mullowney    Options Database Keys:
811e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
8128468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
8138468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
8148468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
8159ae82921SPaul Mullowney 
8169ae82921SPaul Mullowney   Level: beginner
8179ae82921SPaul Mullowney 
8186bb69460SJunchao Zhang  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
8196bb69460SJunchao Zhang M*/
8206bb69460SJunchao Zhang 
8216bb69460SJunchao Zhang /*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.
8236bb69460SJunchao Zhang 
8246bb69460SJunchao Zhang   Level: beginner
8256bb69460SJunchao Zhang 
8266bb69460SJunchao Zhang  .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
8279ae82921SPaul Mullowney M*/
8283fa6b06aSMark Adams 
// Get GPU pointers to a stripped-down (split-CSR) view of a Mat, for both seq and MPI Mat.
//
// On first use this builds a PetscSplitCSRDataStructure on the host describing the
// diagonal (and, in parallel, off-diagonal) CSR blocks already resident on the GPU,
// copies it to the device, and caches the device pointer in the matrix's spptr.
// Subsequent calls reuse the cached device struct.
//
// Input:  A - an assembled, unfactored (MAT_FACTOR_NONE) SEQAIJCUSPARSE or MPIAIJCUSPARSE matrix
// Output: B - device pointer to the split-CSR descriptor, or NULL if A is factored
//
// Side effect: sets A->offloadmask = PETSC_OFFLOAD_GPU, since the caller is expected
// to modify values on the device through the returned descriptor.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  PetscErrorCode             ierr;
  int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;  // 32-bit CSR arrays as stored by cuSPARSE (device pointers)
  PetscScalar                *aa = NULL,*ba = NULL;                        // CSR value arrays (device pointers)
  Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;                   // host-side CSR metadata for diag (a) and offdiag (b) blocks
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL,*matrixB = NULL;

  PetscFunctionBegin;
  PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    // Factored matrices carry no split-CSR device view; signal that with NULL.
    *B = NULL;
    PetscFunctionReturn(0);
  }
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  // get jaca
  if (size == 1) {
    PetscBool isseqaij;

    // Sequential case: A may be a plain SEQAIJCUSPARSE, or an MPIAIJCUSPARSE on one rank
    // (in which case the diagonal block aij->A holds all the data).
    ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
    if (isseqaij) {
      jaca = (Mat_SeqAIJ*)A->data;
      PetscCheckFalse(!jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      d_mat = cusparsestructA->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);  // ensure the CSR arrays exist on the device before we take raw pointers
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
    }
    if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      bi = NULL;  // no off-diagonal block in the sequential case
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    // Parallel case: need both the diagonal (A) and off-diagonal (B) blocks on the device.
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    PetscCheckFalse(!A->nooffprocentries && !aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    PetscCheckFalse(cusparsestructA->format!=MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      PetscCheckFalse(!matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheckFalse(!matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        // Compressed-row storage drops empty rows; the device kernels need full row offsets,
        // so lazily build (and cache) an uncompressed copy of jacb->i on the GPU.
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    // Same uncompressed-row-offsets treatment for the diagonal block.
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    cudaError_t                cerr;
    PetscSplitCSRDataStructure h_mat;

    // create and populate struct on host and copy on device
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscInt   *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      PetscCheckFalse(n && !aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      // colmap maps global column -> local off-diagonal column + 1 (0 means not present);
      // PetscCalloc1 provides the zero default.
      ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // have to make a long version of these: cuSPARSE stores 32-bit indices,
        // but the split-CSR struct uses PetscInt, so widen B's i/j on the host
        // and upload fresh 64-bit device copies (owned by the struct).
        int        *h_bi32, *h_bj32;
        PetscInt   *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        ierr = PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);CHKERRQ(ierr);
        cerr = cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
        cerr = cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];

        cerr = cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

        h_mat->offdiag.i = d_bi64;
        h_mat->offdiag.j = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE;  // struct owns these device arrays

        ierr = PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);CHKERRQ(ierr);
      }
#else
      // 32-bit PetscInt: alias cuSPARSE's arrays directly, no copy owned by the struct
      h_mat->offdiag.i = (PetscInt*)bi;
      h_mat->offdiag.j = (PetscInt*)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      ierr = PetscFree(colmap);CHKERRQ(ierr);
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N;  // NOTE(review): global column count (indexes colmap); field name suggests rows — confirm against PetscSplitCSRDataStructure
#if defined(PETSC_USE_64BIT_INDICES)
    {
      PetscCheckFalse(sizeof(PetscInt) != 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"size of PetscInt = %d",(int)sizeof(PetscInt));
      // Same 32-bit -> 64-bit widening as above, for the diagonal block's i/j.
      int        *h_ai32, *h_aj32;
      PetscInt   *h_ai64, *h_aj64, *d_ai64, *d_aj64;
      ierr = PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);CHKERRQ(ierr);
      cerr = cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
      cerr = cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];

      cerr = cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      cerr = cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

      h_mat->diag.i = d_ai64;
      h_mat->diag.j = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      ierr = PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);CHKERRQ(ierr);
    }
#else
    h_mat->diag.i = (PetscInt*)ai;
    h_mat->diag.j = (PetscInt*)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
  }
  *B = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;  // caller will write values on the device
  PetscFunctionReturn(0);
}
1017