xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 08401ef684002a709c6d3db98a0c9f54a8bcf1ec)
1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
299acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
30fd2b57fSKarl Rupp 
43d13b8fdSMatthew G. Knepley #include <petscconf.h>
59ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
657d48284SJunchao Zhang #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
73d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
87e8381f9SStefano Zampini #include <thrust/advance.h>
9a2cee5feSJed Brown #include <thrust/partition.h>
10a2cee5feSJed Brown #include <thrust/sort.h>
11a2cee5feSJed Brown #include <thrust/unique.h>
124eb5378fSStefano Zampini #include <petscsf.h>
137e8381f9SStefano Zampini 
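/* Thrust functor for use with a zip iterator: copies the first tuple element into the second,
   i.e. scatters (permuted) source values into a destination array on the device */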
147e8381f9SStefano Zampini struct VecCUDAEquals
157e8381f9SStefano Zampini {
167e8381f9SStefano Zampini   template <typename Tuple>
177e8381f9SStefano Zampini   __host__ __device__
187e8381f9SStefano Zampini   void operator()(Tuple t)
197e8381f9SStefano Zampini   {
207e8381f9SStefano Zampini     thrust::get<1>(t) = thrust::get<0>(t);
217e8381f9SStefano Zampini   }
227e8381f9SStefano Zampini };
237e8381f9SStefano Zampini 
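/* Release the device-side arrays created for the extended COO assembly path, as well as the
   permutation and work arrays used by the basic COO path */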
24cbc6b225SStefano Zampini static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
25cbc6b225SStefano Zampini {
26cbc6b225SStefano Zampini   Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)mat->data;
27cbc6b225SStefano Zampini   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
28cbc6b225SStefano Zampini 
29cbc6b225SStefano Zampini   PetscFunctionBegin;
30cbc6b225SStefano Zampini   if (!cusparseStruct) PetscFunctionReturn(0);
31cbc6b225SStefano Zampini   if (cusparseStruct->use_extended_coo) {
329566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusparseStruct->Aimap1_d));
339566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
349566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
359566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusparseStruct->Bimap1_d));
369566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
379566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
389566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
399566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
409566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
419566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
429566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
439566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
449566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
459566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
469566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
47cbc6b225SStefano Zampini   }
48cbc6b225SStefano Zampini   cusparseStruct->use_extended_coo = PETSC_FALSE;
49cbc6b225SStefano Zampini   delete cusparseStruct->coo_p;
50cbc6b225SStefano Zampini   delete cusparseStruct->coo_pw;
51cbc6b225SStefano Zampini   cusparseStruct->coo_p  = NULL;
52cbc6b225SStefano Zampini   cusparseStruct->coo_pw = NULL;
53cbc6b225SStefano Zampini   PetscFunctionReturn(0);
54cbc6b225SStefano Zampini }
55cbc6b225SStefano Zampini 
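/* Basic COO values path: if preallocation produced a permutation (coo_p), gather the user-provided
   values through it into the work array, then pass the diagonal and off-diagonal portions to the
   sequential setters */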
56219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
577e8381f9SStefano Zampini {
587e8381f9SStefano Zampini   Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
597e8381f9SStefano Zampini   Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
607e8381f9SStefano Zampini   PetscInt           n = cusp->coo_nd + cusp->coo_no;
617e8381f9SStefano Zampini 
627e8381f9SStefano Zampini   PetscFunctionBegin;
63e61fc153SStefano Zampini   if (cusp->coo_p && v) {
6408391a17SStefano Zampini     thrust::device_ptr<const PetscScalar> d_v;
6508391a17SStefano Zampini     THRUSTARRAY                           *w = NULL;
6608391a17SStefano Zampini 
6708391a17SStefano Zampini     if (isCudaMem(v)) {
6808391a17SStefano Zampini       d_v = thrust::device_pointer_cast(v);
6908391a17SStefano Zampini     } else {
70e61fc153SStefano Zampini       w = new THRUSTARRAY(n);
71e61fc153SStefano Zampini       w->assign(v,v+n);
729566063dSJacob Faibussowitsch       PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
7308391a17SStefano Zampini       d_v = w->data();
7408391a17SStefano Zampini     }
7508391a17SStefano Zampini 
7608391a17SStefano Zampini     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
777e8381f9SStefano Zampini                                                               cusp->coo_pw->begin()));
7808391a17SStefano Zampini     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
797e8381f9SStefano Zampini                                                               cusp->coo_pw->end()));
809566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeBegin());
817e8381f9SStefano Zampini     thrust::for_each(zibit,zieit,VecCUDAEquals());
829566063dSJacob Faibussowitsch     PetscCall(PetscLogGpuTimeEnd());
83e61fc153SStefano Zampini     delete w;
849566063dSJacob Faibussowitsch     PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode));
859566063dSJacob Faibussowitsch     PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode));
867e8381f9SStefano Zampini   } else {
879566063dSJacob Faibussowitsch     PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode));
889566063dSJacob Faibussowitsch     PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode));
897e8381f9SStefano Zampini   }
907e8381f9SStefano Zampini   PetscFunctionReturn(0);
917e8381f9SStefano Zampini }
927e8381f9SStefano Zampini 
937e8381f9SStefano Zampini template <typename Tuple>
947e8381f9SStefano Zampini struct IsNotOffDiagT
957e8381f9SStefano Zampini {
967e8381f9SStefano Zampini   PetscInt _cstart,_cend;
977e8381f9SStefano Zampini 
987e8381f9SStefano Zampini   IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
997e8381f9SStefano Zampini   __host__ __device__
1007e8381f9SStefano Zampini   inline bool operator()(Tuple t)
1017e8381f9SStefano Zampini   {
1027e8381f9SStefano Zampini     return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
1037e8381f9SStefano Zampini   }
1047e8381f9SStefano Zampini };
1057e8381f9SStefano Zampini 
1067e8381f9SStefano Zampini struct IsOffDiag
1077e8381f9SStefano Zampini {
1087e8381f9SStefano Zampini   PetscInt _cstart,_cend;
1097e8381f9SStefano Zampini 
1107e8381f9SStefano Zampini   IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
1117e8381f9SStefano Zampini   __host__ __device__
1127e8381f9SStefano Zampini   inline bool operator() (const PetscInt &c)
1137e8381f9SStefano Zampini   {
1147e8381f9SStefano Zampini     return c < _cstart || c >= _cend;
1157e8381f9SStefano Zampini   }
1167e8381f9SStefano Zampini };
1177e8381f9SStefano Zampini 
1187e8381f9SStefano Zampini struct GlobToLoc
1197e8381f9SStefano Zampini {
1207e8381f9SStefano Zampini   PetscInt _start;
1217e8381f9SStefano Zampini 
1227e8381f9SStefano Zampini   GlobToLoc(PetscInt start) : _start(start) {}
1237e8381f9SStefano Zampini   __host__ __device__
1247e8381f9SStefano Zampini   inline PetscInt operator() (const PetscInt &c)
1257e8381f9SStefano Zampini   {
1267e8381f9SStefano Zampini     return c - _start;
1277e8381f9SStefano Zampini   }
1287e8381f9SStefano Zampini };
1297e8381f9SStefano Zampini 
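/* Basic COO preallocation: copy the COO indices to the device, partition them so that entries of the
   diagonal block come first, convert global indices to local ones, compact the off-diagonal column
   indices into garray, and preallocate the sequential diagonal (A) and off-diagonal (B) blocks
   directly on the GPU */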
130219fbbafSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
1317e8381f9SStefano Zampini {
1327e8381f9SStefano Zampini   Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
1337e8381f9SStefano Zampini   Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
13482a78a4eSJed Brown   PetscInt               N,*jj;
1357e8381f9SStefano Zampini   size_t                 noff = 0;
136ddea5d60SJunchao Zhang   THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
1377e8381f9SStefano Zampini   THRUSTINTARRAY         d_j(n);
1387e8381f9SStefano Zampini   ISLocalToGlobalMapping l2g;
1397e8381f9SStefano Zampini 
1407e8381f9SStefano Zampini   PetscFunctionBegin;
1419566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&b->A));
1429566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&b->B));
1437e8381f9SStefano Zampini 
1449566063dSJacob Faibussowitsch   PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
1457e8381f9SStefano Zampini   d_i.assign(coo_i,coo_i+n);
1467e8381f9SStefano Zampini   d_j.assign(coo_j,coo_j+n);
1479566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
1487e8381f9SStefano Zampini   auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
1497e8381f9SStefano Zampini   auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
1507e8381f9SStefano Zampini   if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
1517e8381f9SStefano Zampini     cusp->coo_p = new THRUSTINTARRAY(n);
1527e8381f9SStefano Zampini     cusp->coo_pw = new THRUSTARRAY(n);
1537e8381f9SStefano Zampini     thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
1547e8381f9SStefano Zampini     auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
1557e8381f9SStefano Zampini     auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
1567e8381f9SStefano Zampini     auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
1577e8381f9SStefano Zampini     firstoffd = mzipp.get_iterator_tuple().get<1>();
1587e8381f9SStefano Zampini   }
1597e8381f9SStefano Zampini   cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
1607e8381f9SStefano Zampini   cusp->coo_no = thrust::distance(firstoffd,d_j.end());
1617e8381f9SStefano Zampini 
1627e8381f9SStefano Zampini   /* from global to local */
1637e8381f9SStefano Zampini   thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
1647e8381f9SStefano Zampini   thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
1659566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
1667e8381f9SStefano Zampini 
1677e8381f9SStefano Zampini   /* copy offdiag column indices to map on the CPU */
1689566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(cusp->coo_no,&jj)); /* jj[] will store compacted col ids of the offdiag part */
1699566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost));
1707e8381f9SStefano Zampini   auto o_j = d_j.begin();
1719566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeBegin());
172ddea5d60SJunchao Zhang   thrust::advance(o_j,cusp->coo_nd); /* point o_j at the start of the offdiag col ids, then sort and unique them */
1737e8381f9SStefano Zampini   thrust::sort(thrust::device,o_j,d_j.end());
174ddea5d60SJunchao Zhang   auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
1759566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuTimeEnd());
1767e8381f9SStefano Zampini   noff = thrust::distance(o_j,wit);
177cbc6b225SStefano Zampini   /* allocate the garray, the columns of B will not be mapped in the multiply setup */
1789566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(noff,&b->garray));
1799566063dSJacob Faibussowitsch   PetscCallCUDA(cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost));
1809566063dSJacob Faibussowitsch   PetscCall(PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt)));
1819566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g));
1829566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH));
1839566063dSJacob Faibussowitsch   PetscCall(ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj));
1842c71b3e2SJacob Faibussowitsch   PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
1859566063dSJacob Faibussowitsch   PetscCall(ISLocalToGlobalMappingDestroy(&l2g));
1867e8381f9SStefano Zampini 
1879566063dSJacob Faibussowitsch   PetscCall(MatCreate(PETSC_COMM_SELF,&b->A));
1889566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n));
1899566063dSJacob Faibussowitsch   PetscCall(MatSetType(b->A,MATSEQAIJCUSPARSE));
1909566063dSJacob Faibussowitsch   PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A));
1919566063dSJacob Faibussowitsch   PetscCall(MatCreate(PETSC_COMM_SELF,&b->B));
1929566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff));
1939566063dSJacob Faibussowitsch   PetscCall(MatSetType(b->B,MATSEQAIJCUSPARSE));
1949566063dSJacob Faibussowitsch   PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B));
1957e8381f9SStefano Zampini 
1967e8381f9SStefano Zampini   /* GPU memory, cusparse specific call handles it internally */
1979566063dSJacob Faibussowitsch   PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get()));
1989566063dSJacob Faibussowitsch   PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj));
1999566063dSJacob Faibussowitsch   PetscCall(PetscFree(jj));
2007e8381f9SStefano Zampini 
2019566063dSJacob Faibussowitsch   PetscCall(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat));
2029566063dSJacob Faibussowitsch   PetscCall(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat));
2037e8381f9SStefano Zampini 
2049566063dSJacob Faibussowitsch   PetscCall(MatBindToCPU(b->A,B->boundtocpu));
2059566063dSJacob Faibussowitsch   PetscCall(MatBindToCPU(b->B,B->boundtocpu));
2069566063dSJacob Faibussowitsch   PetscCall(MatSetUpMultiply_MPIAIJ(B));
2077e8381f9SStefano Zampini   PetscFunctionReturn(0);
2087e8381f9SStefano Zampini }
2099ae82921SPaul Mullowney 
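/* COO preallocation entry point: if on every rank the given indices are locally owned and
   nonnegative (assumed when they are already in device memory), use the cheaper basic path above;
   otherwise fall back to the generic MPIAIJ COO machinery and mirror its maps and send/receive
   buffers in device memory for use in MatSetValuesCOO */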
210219fbbafSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
211219fbbafSJunchao Zhang {
212cbc6b225SStefano Zampini   Mat_MPIAIJ         *mpiaij    = (Mat_MPIAIJ*)mat->data;
213219fbbafSJunchao Zhang   Mat_MPIAIJCUSPARSE *mpidev;
214cbc6b225SStefano Zampini   PetscBool           coo_basic = PETSC_TRUE;
215219fbbafSJunchao Zhang   PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
216219fbbafSJunchao Zhang   PetscInt            rstart,rend;
217219fbbafSJunchao Zhang 
218219fbbafSJunchao Zhang   PetscFunctionBegin;
2199566063dSJacob Faibussowitsch   PetscCall(PetscFree(mpiaij->garray));
2209566063dSJacob Faibussowitsch   PetscCall(VecDestroy(&mpiaij->lvec));
221cbc6b225SStefano Zampini #if defined(PETSC_USE_CTABLE)
2229566063dSJacob Faibussowitsch   PetscCall(PetscTableDestroy(&mpiaij->colmap));
223cbc6b225SStefano Zampini #else
2249566063dSJacob Faibussowitsch   PetscCall(PetscFree(mpiaij->colmap));
225cbc6b225SStefano Zampini #endif
2269566063dSJacob Faibussowitsch   PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
227cbc6b225SStefano Zampini   mat->assembled     = PETSC_FALSE;
228cbc6b225SStefano Zampini   mat->was_assembled = PETSC_FALSE;
2299566063dSJacob Faibussowitsch   PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
2309566063dSJacob Faibussowitsch   PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
231219fbbafSJunchao Zhang   if (coo_i) {
2329566063dSJacob Faibussowitsch     PetscCall(PetscLayoutGetRange(mat->rmap,&rstart,&rend));
2339566063dSJacob Faibussowitsch     PetscCall(PetscGetMemType(coo_i,&mtype));
234219fbbafSJunchao Zhang     if (PetscMemTypeHost(mtype)) {
235219fbbafSJunchao Zhang       for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
236cbc6b225SStefano Zampini         if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = PETSC_FALSE; break;}
237219fbbafSJunchao Zhang       }
238219fbbafSJunchao Zhang     }
239219fbbafSJunchao Zhang   }
240219fbbafSJunchao Zhang   /* All ranks must agree on the value of coo_basic */
2419566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)mat)));
242219fbbafSJunchao Zhang   if (coo_basic) {
2439566063dSJacob Faibussowitsch     PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
244219fbbafSJunchao Zhang   } else {
2459566063dSJacob Faibussowitsch     PetscCall(MatSetPreallocationCOO_MPIAIJ(mat,coo_n,coo_i,coo_j));
246cbc6b225SStefano Zampini     mat->offloadmask = PETSC_OFFLOAD_CPU;
247cbc6b225SStefano Zampini     /* creates the GPU memory */
2489566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
2499566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
250cbc6b225SStefano Zampini     mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
251219fbbafSJunchao Zhang     mpidev->use_extended_coo = PETSC_TRUE;
252219fbbafSJunchao Zhang 
2539566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount)));
2549566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount)));
2559566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount)));
256219fbbafSJunchao Zhang 
2579566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount)));
2589566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount)));
2599566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount)));
260219fbbafSJunchao Zhang 
2619566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount)));
2629566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount)));
2639566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount)));
264219fbbafSJunchao Zhang 
2659566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount)));
2669566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount)));
2679566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount)));
268219fbbafSJunchao Zhang 
2699566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount)));
2709566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar)));
2719566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar)));
272219fbbafSJunchao Zhang 
2739566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice));
2749566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
2759566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice));
276219fbbafSJunchao Zhang 
2779566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice));
2789566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
2799566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice));
280219fbbafSJunchao Zhang 
2819566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice));
2829566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
2839566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice));
284219fbbafSJunchao Zhang 
2859566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice));
2869566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
2879566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice));
288219fbbafSJunchao Zhang 
2899566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice));
290219fbbafSJunchao Zhang   }
291219fbbafSJunchao Zhang   PetscFunctionReturn(0);
292219fbbafSJunchao Zhang }
293219fbbafSJunchao Zhang 
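/* Grid-stride gather kernel: buf[i] = kv[perm[i]]; used to pack the COO values that must be sent to other ranks */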
294219fbbafSJunchao Zhang __global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
295219fbbafSJunchao Zhang {
296219fbbafSJunchao Zhang   PetscCount       i = blockIdx.x*blockDim.x + threadIdx.x;
297219fbbafSJunchao Zhang   const PetscCount grid_size = gridDim.x * blockDim.x;
298219fbbafSJunchao Zhang   for (; i<nnz; i+= grid_size) buf[i] = kv[perm[i]];
299219fbbafSJunchao Zhang }
300219fbbafSJunchao Zhang 
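/* Grid-stride accumulation kernel: for each owned nonzero i, sum the repeated COO contributions
   kv[perm[k]], jmap[i] <= k < jmap[i+1], into the matrix value array entry a[imap[i]] */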
301219fbbafSJunchao Zhang __global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[])
302219fbbafSJunchao Zhang {
303219fbbafSJunchao Zhang   PetscCount       i = blockIdx.x*blockDim.x + threadIdx.x;
304219fbbafSJunchao Zhang   const PetscCount grid_size  = gridDim.x * blockDim.x;
305219fbbafSJunchao Zhang   for (; i<nnz; i += grid_size) {for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) a[imap[i]] += kv[perm[k]];}
306219fbbafSJunchao Zhang }
307219fbbafSJunchao Zhang 
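/* Set matrix values from COO entries. With the extended COO layout, copy host values to the device
   if needed, pack the entries destined for other ranks and ship them with a PetscSF reduction, and
   overlap that communication with the insertion of locally owned entries; otherwise defer to the
   basic path above */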
308219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
309219fbbafSJunchao Zhang {
310219fbbafSJunchao Zhang   Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
311219fbbafSJunchao Zhang   Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
312219fbbafSJunchao Zhang   Mat                 A      = mpiaij->A,B = mpiaij->B;
313219fbbafSJunchao Zhang   PetscCount          Annz1  = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
314cbc6b225SStefano Zampini   PetscScalar        *Aa,*Ba = NULL;
315219fbbafSJunchao Zhang   PetscScalar        *vsend  = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
316219fbbafSJunchao Zhang   const PetscScalar  *v1     = v;
317219fbbafSJunchao Zhang   const PetscCount   *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d;
318219fbbafSJunchao Zhang   const PetscCount   *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d;
319219fbbafSJunchao Zhang   const PetscCount   *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
320219fbbafSJunchao Zhang   const PetscCount   *Cperm1 = mpidev->Cperm1_d;
321219fbbafSJunchao Zhang   PetscMemType        memtype;
322219fbbafSJunchao Zhang 
323219fbbafSJunchao Zhang   PetscFunctionBegin;
324219fbbafSJunchao Zhang   if (mpidev->use_extended_coo) {
325cbc6b225SStefano Zampini     PetscMPIInt size;
326cbc6b225SStefano Zampini 
3279566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size));
3289566063dSJacob Faibussowitsch     PetscCall(PetscGetMemType(v,&memtype));
329219fbbafSJunchao Zhang     if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on the host, we need to copy it to the device */
3309566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar)));
3319566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
332219fbbafSJunchao Zhang     }
333219fbbafSJunchao Zhang 
334219fbbafSJunchao Zhang     if (imode == INSERT_VALUES) {
3359566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); /* write matrix values */
3369566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemset(Aa,0,((Mat_SeqAIJ*)A->data)->nz*sizeof(PetscScalar)));
337cbc6b225SStefano Zampini       if (size > 1) {
3389566063dSJacob Faibussowitsch         PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba));
3399566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemset(Ba,0,((Mat_SeqAIJ*)B->data)->nz*sizeof(PetscScalar)));
340cbc6b225SStefano Zampini       }
341219fbbafSJunchao Zhang     } else {
3429566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa)); /* read & write matrix values */
3439566063dSJacob Faibussowitsch       if (size > 1) PetscCall(MatSeqAIJCUSPARSEGetArray(B,&Ba));
344219fbbafSJunchao Zhang     }
345219fbbafSJunchao Zhang 
346219fbbafSJunchao Zhang     /* Pack entries to be sent to remote */
347cbc6b225SStefano Zampini     if (mpiaij->sendlen) {
348219fbbafSJunchao Zhang       MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);
3499566063dSJacob Faibussowitsch       PetscCallCUDA(cudaPeekAtLastError());
350cbc6b225SStefano Zampini     }
351219fbbafSJunchao Zhang 
352219fbbafSJunchao Zhang     /* Send remote entries to their owner and overlap the communication with local computation */
3539566063dSJacob Faibussowitsch     if (size > 1) PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE));
354219fbbafSJunchao Zhang     /* Add local entries to A and B */
355cbc6b225SStefano Zampini     if (Annz1) {
356219fbbafSJunchao Zhang       MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa);
3579566063dSJacob Faibussowitsch       PetscCallCUDA(cudaPeekAtLastError());
358cbc6b225SStefano Zampini     }
359cbc6b225SStefano Zampini     if (Bnnz1) {
360219fbbafSJunchao Zhang       MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba);
3619566063dSJacob Faibussowitsch       PetscCallCUDA(cudaPeekAtLastError());
362cbc6b225SStefano Zampini     }
3639566063dSJacob Faibussowitsch     if (size > 1) PetscCall(PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE));
364219fbbafSJunchao Zhang 
365219fbbafSJunchao Zhang     /* Add received remote entries to A and B */
366cbc6b225SStefano Zampini     if (Annz2) {
367219fbbafSJunchao Zhang       MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa);
3689566063dSJacob Faibussowitsch       PetscCallCUDA(cudaPeekAtLastError());
369cbc6b225SStefano Zampini     }
370cbc6b225SStefano Zampini     if (Bnnz2) {
371219fbbafSJunchao Zhang       MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);
3729566063dSJacob Faibussowitsch       PetscCallCUDA(cudaPeekAtLastError());
373cbc6b225SStefano Zampini     }
374219fbbafSJunchao Zhang 
375219fbbafSJunchao Zhang     if (imode == INSERT_VALUES) {
3769566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
3779566063dSJacob Faibussowitsch       if (size > 1) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba));
378219fbbafSJunchao Zhang     } else {
3799566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));
3809566063dSJacob Faibussowitsch       if (size > 1) PetscCall(MatSeqAIJCUSPARSERestoreArray(B,&Ba));
381219fbbafSJunchao Zhang     }
3829566063dSJacob Faibussowitsch     if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1));
383219fbbafSJunchao Zhang   } else {
3849566063dSJacob Faibussowitsch     PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode));
385219fbbafSJunchao Zhang   }
386cbc6b225SStefano Zampini   mat->offloadmask = PETSC_OFFLOAD_GPU;
387219fbbafSJunchao Zhang   PetscFunctionReturn(0);
388219fbbafSJunchao Zhang }
389219fbbafSJunchao Zhang 
390ed502f03SStefano Zampini static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
391ed502f03SStefano Zampini {
392ed502f03SStefano Zampini   Mat             Ad,Ao;
393ed502f03SStefano Zampini   const PetscInt *cmap;
394ed502f03SStefano Zampini 
395ed502f03SStefano Zampini   PetscFunctionBegin;
3969566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap));
3979566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc));
398ed502f03SStefano Zampini   if (glob) {
399ed502f03SStefano Zampini     PetscInt cst, i, dn, on, *gidx;
400ed502f03SStefano Zampini 
4019566063dSJacob Faibussowitsch     PetscCall(MatGetLocalSize(Ad,NULL,&dn));
4029566063dSJacob Faibussowitsch     PetscCall(MatGetLocalSize(Ao,NULL,&on));
4039566063dSJacob Faibussowitsch     PetscCall(MatGetOwnershipRangeColumn(A,&cst,NULL));
4049566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(dn+on,&gidx));
405ed502f03SStefano Zampini     for (i = 0; i<dn; i++) gidx[i]    = cst + i;
406ed502f03SStefano Zampini     for (i = 0; i<on; i++) gidx[i+dn] = cmap[i];
4079566063dSJacob Faibussowitsch     PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob));
408ed502f03SStefano Zampini   }
409ed502f03SStefano Zampini   PetscFunctionReturn(0);
410ed502f03SStefano Zampini }
411ed502f03SStefano Zampini 
4129ae82921SPaul Mullowney PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
4139ae82921SPaul Mullowney {
414bbf3fe20SPaul Mullowney   Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data;
415bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
4169ae82921SPaul Mullowney   PetscInt           i;
4179ae82921SPaul Mullowney 
4189ae82921SPaul Mullowney   PetscFunctionBegin;
4199566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(B->rmap));
4209566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(B->cmap));
4217e8381f9SStefano Zampini   if (PetscDefined(USE_DEBUG) && d_nnz) {
4229ae82921SPaul Mullowney     for (i=0; i<B->rmap->n; i++) {
423*08401ef6SPierre Jolivet       PetscCheck(d_nnz[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
4249ae82921SPaul Mullowney     }
4259ae82921SPaul Mullowney   }
4267e8381f9SStefano Zampini   if (PetscDefined(USE_DEBUG) && o_nnz) {
4279ae82921SPaul Mullowney     for (i=0; i<B->rmap->n; i++) {
428*08401ef6SPierre Jolivet       PetscCheck(o_nnz[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
4299ae82921SPaul Mullowney     }
4309ae82921SPaul Mullowney   }
4316a29ce69SStefano Zampini #if defined(PETSC_USE_CTABLE)
4329566063dSJacob Faibussowitsch   PetscCall(PetscTableDestroy(&b->colmap));
4336a29ce69SStefano Zampini #else
4349566063dSJacob Faibussowitsch   PetscCall(PetscFree(b->colmap));
4356a29ce69SStefano Zampini #endif
4369566063dSJacob Faibussowitsch   PetscCall(PetscFree(b->garray));
4379566063dSJacob Faibussowitsch   PetscCall(VecDestroy(&b->lvec));
4389566063dSJacob Faibussowitsch   PetscCall(VecScatterDestroy(&b->Mvctx));
4396a29ce69SStefano Zampini   /* Because the off-diagonal block B will have been resized, we simply destroy it and create a new one each time */
4409566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&b->B));
4416a29ce69SStefano Zampini   if (!b->A) {
4429566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF,&b->A));
4439566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n));
4449566063dSJacob Faibussowitsch     PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A));
4456a29ce69SStefano Zampini   }
4466a29ce69SStefano Zampini   if (!b->B) {
4476a29ce69SStefano Zampini     PetscMPIInt size;
4489566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size));
4499566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF,&b->B));
4509566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0));
4519566063dSJacob Faibussowitsch     PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B));
4529ae82921SPaul Mullowney   }
4539566063dSJacob Faibussowitsch   PetscCall(MatSetType(b->A,MATSEQAIJCUSPARSE));
4549566063dSJacob Faibussowitsch   PetscCall(MatSetType(b->B,MATSEQAIJCUSPARSE));
4559566063dSJacob Faibussowitsch   PetscCall(MatBindToCPU(b->A,B->boundtocpu));
4569566063dSJacob Faibussowitsch   PetscCall(MatBindToCPU(b->B,B->boundtocpu));
4579566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz));
4589566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz));
4599566063dSJacob Faibussowitsch   PetscCall(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat));
4609566063dSJacob Faibussowitsch   PetscCall(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat));
4619ae82921SPaul Mullowney   B->preallocated = PETSC_TRUE;
4629ae82921SPaul Mullowney   PetscFunctionReturn(0);
4639ae82921SPaul Mullowney }
464e057df02SPaul Mullowney 
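/* y = A*x: start the scatter of the needed off-process entries of x, multiply by the diagonal block
   while the communication proceeds, then complete the scatter and add the off-diagonal block
   contribution */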
4659ae82921SPaul Mullowney PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
4669ae82921SPaul Mullowney {
4679ae82921SPaul Mullowney   Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
4689ae82921SPaul Mullowney 
4699ae82921SPaul Mullowney   PetscFunctionBegin;
4709566063dSJacob Faibussowitsch   PetscCall(VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
4719566063dSJacob Faibussowitsch   PetscCall((*a->A->ops->mult)(a->A,xx,yy));
4729566063dSJacob Faibussowitsch   PetscCall(VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
4739566063dSJacob Faibussowitsch   PetscCall((*a->B->ops->multadd)(a->B,a->lvec,yy,yy));
4749ae82921SPaul Mullowney   PetscFunctionReturn(0);
4759ae82921SPaul Mullowney }
476ca45077fSPaul Mullowney 
4773fa6b06aSMark Adams PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
4783fa6b06aSMark Adams {
4793fa6b06aSMark Adams   Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
4803fa6b06aSMark Adams 
4813fa6b06aSMark Adams   PetscFunctionBegin;
4829566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(l->A));
4839566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(l->B));
4843fa6b06aSMark Adams   PetscFunctionReturn(0);
4853fa6b06aSMark Adams }
486042217e8SBarry Smith 
487fdc842d1SBarry Smith PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
488fdc842d1SBarry Smith {
489fdc842d1SBarry Smith   Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
490fdc842d1SBarry Smith 
491fdc842d1SBarry Smith   PetscFunctionBegin;
4929566063dSJacob Faibussowitsch   PetscCall(VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
4939566063dSJacob Faibussowitsch   PetscCall((*a->A->ops->multadd)(a->A,xx,yy,zz));
4949566063dSJacob Faibussowitsch   PetscCall(VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
4959566063dSJacob Faibussowitsch   PetscCall((*a->B->ops->multadd)(a->B,a->lvec,zz,zz));
496fdc842d1SBarry Smith   PetscFunctionReturn(0);
497fdc842d1SBarry Smith }
498fdc842d1SBarry Smith 
499ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
500ca45077fSPaul Mullowney {
501ca45077fSPaul Mullowney   Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
502ca45077fSPaul Mullowney 
503ca45077fSPaul Mullowney   PetscFunctionBegin;
5049566063dSJacob Faibussowitsch   PetscCall((*a->B->ops->multtranspose)(a->B,xx,a->lvec));
5059566063dSJacob Faibussowitsch   PetscCall((*a->A->ops->multtranspose)(a->A,xx,yy));
5069566063dSJacob Faibussowitsch   PetscCall(VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE));
5079566063dSJacob Faibussowitsch   PetscCall(VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE));
508ca45077fSPaul Mullowney   PetscFunctionReturn(0);
509ca45077fSPaul Mullowney }
5109ae82921SPaul Mullowney 
511e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
512ca45077fSPaul Mullowney {
513ca45077fSPaul Mullowney   Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
514bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
515e057df02SPaul Mullowney 
516ca45077fSPaul Mullowney   PetscFunctionBegin;
517e057df02SPaul Mullowney   switch (op) {
518e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT_DIAG:
519e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat = format;
520045c96e1SPaul Mullowney     break;
521e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT_OFFDIAG:
522e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = format;
523045c96e1SPaul Mullowney     break;
524e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
525e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat    = format;
526e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = format;
527045c96e1SPaul Mullowney     break;
528e057df02SPaul Mullowney   default:
52998921bdaSJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
530045c96e1SPaul Mullowney   }
531ca45077fSPaul Mullowney   PetscFunctionReturn(0);
532ca45077fSPaul Mullowney }
533e057df02SPaul Mullowney 
5344416b707SBarry Smith PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
535e057df02SPaul Mullowney {
536e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
537e057df02SPaul Mullowney   PetscBool                flg;
538a183c035SDominic Meiser   Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
539a183c035SDominic Meiser   Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
5405fd66863SKarl Rupp 
541e057df02SPaul Mullowney   PetscFunctionBegin;
5429566063dSJacob Faibussowitsch   PetscCall(PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options"));
543e057df02SPaul Mullowney   if (A->factortype==MAT_FACTOR_NONE) {
5449566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
5455f80ce2aSJacob Faibussowitsch                              "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
5469566063dSJacob Faibussowitsch     if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format));
5479566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
5485f80ce2aSJacob Faibussowitsch                              "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg));
5499566063dSJacob Faibussowitsch     if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format));
5509566063dSJacob Faibussowitsch     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
5515f80ce2aSJacob Faibussowitsch                              "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
5529566063dSJacob Faibussowitsch     if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
553e057df02SPaul Mullowney   }
5549566063dSJacob Faibussowitsch   PetscCall(PetscOptionsTail());
555e057df02SPaul Mullowney   PetscFunctionReturn(0);
556e057df02SPaul Mullowney }
557e057df02SPaul Mullowney 
55834d6c7a5SJose E. Roman PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
55934d6c7a5SJose E. Roman {
5603fa6b06aSMark Adams   Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
561042217e8SBarry Smith   Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
562042217e8SBarry Smith   PetscObjectState   onnz = A->nonzerostate;
563042217e8SBarry Smith 
56434d6c7a5SJose E. Roman   PetscFunctionBegin;
5659566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_MPIAIJ(A,mode));
5669566063dSJacob Faibussowitsch   if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec,VECSEQCUDA));
567042217e8SBarry Smith   if (onnz != A->nonzerostate && cusp->deviceMat) {
568042217e8SBarry Smith     PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
569042217e8SBarry Smith 
5709566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
5719566063dSJacob Faibussowitsch     PetscCall(PetscNew(&h_mat));
5729566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost));
5739566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(h_mat->colmap));
57499551766SMark Adams     if (h_mat->allocated_indices) {
5759566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(h_mat->diag.i));
5769566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(h_mat->diag.j));
57799551766SMark Adams       if (h_mat->offdiag.j) {
5789566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFree(h_mat->offdiag.i));
5799566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFree(h_mat->offdiag.j));
58099551766SMark Adams       }
58199551766SMark Adams     }
5829566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(d_mat));
5839566063dSJacob Faibussowitsch     PetscCall(PetscFree(h_mat));
584042217e8SBarry Smith     cusp->deviceMat = NULL;
5853fa6b06aSMark Adams   }
58634d6c7a5SJose E. Roman   PetscFunctionReturn(0);
58734d6c7a5SJose E. Roman }
58834d6c7a5SJose E. Roman 
589bbf3fe20SPaul Mullowney PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
590bbf3fe20SPaul Mullowney {
5913fa6b06aSMark Adams   Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
5923fa6b06aSMark Adams   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
593bbf3fe20SPaul Mullowney 
594bbf3fe20SPaul Mullowney   PetscFunctionBegin;
59528b400f6SJacob Faibussowitsch   PetscCheck(cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
5963fa6b06aSMark Adams   if (cusparseStruct->deviceMat) {
597042217e8SBarry Smith     PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;
598042217e8SBarry Smith 
5999566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A,"Have device matrix\n"));
6009566063dSJacob Faibussowitsch     PetscCall(PetscNew(&h_mat));
6019566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost));
6029566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(h_mat->colmap));
60399551766SMark Adams     if (h_mat->allocated_indices) {
6049566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(h_mat->diag.i));
6059566063dSJacob Faibussowitsch       PetscCallCUDA(cudaFree(h_mat->diag.j));
60699551766SMark Adams       if (h_mat->offdiag.j) {
6079566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFree(h_mat->offdiag.i));
6089566063dSJacob Faibussowitsch         PetscCallCUDA(cudaFree(h_mat->offdiag.j));
60999551766SMark Adams       }
61099551766SMark Adams     }
6119566063dSJacob Faibussowitsch     PetscCallCUDA(cudaFree(d_mat));
6129566063dSJacob Faibussowitsch     PetscCall(PetscFree(h_mat));
6133fa6b06aSMark Adams   }
614cbc6b225SStefano Zampini   /* Free COO */
6159566063dSJacob Faibussowitsch   PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
616acbf8a88SJunchao Zhang   PetscCallCXX(delete cusparseStruct);
6179566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL));
6189566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL));
6199566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
6209566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
6219566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
6229566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL));
6239566063dSJacob Faibussowitsch   PetscCall(MatDestroy_MPIAIJ(A));
624bbf3fe20SPaul Mullowney   PetscFunctionReturn(0);
625bbf3fe20SPaul Mullowney }
626ca45077fSPaul Mullowney 
6273338378cSStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
6289ae82921SPaul Mullowney {
629bbf3fe20SPaul Mullowney   Mat_MPIAIJ *a;
6303338378cSStefano Zampini   Mat         A;
6319ae82921SPaul Mullowney 
6329ae82921SPaul Mullowney   PetscFunctionBegin;
6339566063dSJacob Faibussowitsch   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
6349566063dSJacob Faibussowitsch   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B,MAT_COPY_VALUES,newmat));
6359566063dSJacob Faibussowitsch   else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B,*newmat,SAME_NONZERO_PATTERN));
6363338378cSStefano Zampini   A = *newmat;
6376f3d89d0SStefano Zampini   A->boundtocpu = PETSC_FALSE;
6389566063dSJacob Faibussowitsch   PetscCall(PetscFree(A->defaultvectype));
6399566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype));
64034136279SStefano Zampini 
641bbf3fe20SPaul Mullowney   a = (Mat_MPIAIJ*)A->data;
6429566063dSJacob Faibussowitsch   if (a->A) PetscCall(MatSetType(a->A,MATSEQAIJCUSPARSE));
6439566063dSJacob Faibussowitsch   if (a->B) PetscCall(MatSetType(a->B,MATSEQAIJCUSPARSE));
6449566063dSJacob Faibussowitsch   if (a->lvec) PetscCall(VecSetType(a->lvec,VECSEQCUDA));
645d98d7c49SStefano Zampini 
6463338378cSStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
647acbf8a88SJunchao Zhang     PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);
6483338378cSStefano Zampini   }
6492205254eSKarl Rupp 
65034d6c7a5SJose E. Roman   A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
651bbf3fe20SPaul Mullowney   A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
652fdc842d1SBarry Smith   A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
653bbf3fe20SPaul Mullowney   A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
654bbf3fe20SPaul Mullowney   A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
655bbf3fe20SPaul Mullowney   A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
6563fa6b06aSMark Adams   A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
6574e84afc0SStefano Zampini   A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
6582205254eSKarl Rupp 
6599566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE));
6609566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
6619566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
6629566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE));
6639566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE));
6649566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE));
665ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE)
6669566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
667ae48a8d0SStefano Zampini #endif
6689ae82921SPaul Mullowney   PetscFunctionReturn(0);
6699ae82921SPaul Mullowney }
6709ae82921SPaul Mullowney 
6713338378cSStefano Zampini PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
6723338378cSStefano Zampini {
6733338378cSStefano Zampini   PetscFunctionBegin;
6749566063dSJacob Faibussowitsch   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
6759566063dSJacob Faibussowitsch   PetscCall(MatCreate_MPIAIJ(A));
6769566063dSJacob Faibussowitsch   PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A));
6773338378cSStefano Zampini   PetscFunctionReturn(0);
6783338378cSStefano Zampini }
6793338378cSStefano Zampini 
6803f9c0db1SPaul Mullowney /*@
6813f9c0db1SPaul Mullowney    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
682e057df02SPaul Mullowney    (the default parallel PETSc format).  This matrix will ultimately be pushed down
6833f9c0db1SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
684e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
685e057df02SPaul Mullowney    the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these parameters accurately,
686e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
6879ae82921SPaul Mullowney 
688d083f849SBarry Smith    Collective
689e057df02SPaul Mullowney 
690e057df02SPaul Mullowney    Input Parameters:
691e057df02SPaul Mullowney +  comm - MPI communicator
692e057df02SPaul Mullowney .  m - number of local rows
693e057df02SPaul Mullowney .  n - number of local columns
694e057df02SPaul Mullowney .  M - number of global rows
695e057df02SPaul Mullowney .  N - number of global columns
6960298fd71SBarry Smith .  d_nz - number of nonzeros per row in the diagonal portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros per row in the diagonal portion (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row in the off-diagonal portion (possibly different for each row) or NULL
697e057df02SPaul Mullowney 
698e057df02SPaul Mullowney    Output Parameter:
699e057df02SPaul Mullowney .  A - the matrix
700e057df02SPaul Mullowney 
701e057df02SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
702e057df02SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
703e057df02SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
704e057df02SPaul Mullowney 
705e057df02SPaul Mullowney    Notes:
706e057df02SPaul Mullowney    If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored
707e057df02SPaul Mullowney 
708e057df02SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
709e057df02SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
710e057df02SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
711e057df02SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
712e057df02SPaul Mullowney 
713e057df02SPaul Mullowney    Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
7140298fd71SBarry Smith    Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
715e057df02SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
716e057df02SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
717e057df02SPaul Mullowney 
718e057df02SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
719e057df02SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
720e057df02SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
721e057df02SPaul Mullowney    reusing matrix information to achieve increased efficiency.
722e057df02SPaul Mullowney 
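   Example usage (a minimal sketch; the global sizes M and N and the per-row nonzero estimates,
   here 5 for the diagonal block and 2 for the off-diagonal block, are placeholders chosen by the
   caller, and error checking with PetscCall() is omitted for brevity):

      Mat A;
      MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,M,N,5,NULL,2,NULL,&A);
      // fill with MatSetValues(), then MatAssemblyBegin()/MatAssemblyEnd() as for any AIJ matrix
      MatDestroy(&A);
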
723e057df02SPaul Mullowney    Level: intermediate
724e057df02SPaul Mullowney 
725e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
726e057df02SPaul Mullowney @*/
7279ae82921SPaul Mullowney PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
7289ae82921SPaul Mullowney {
7299ae82921SPaul Mullowney   PetscMPIInt size;
7309ae82921SPaul Mullowney 
7319ae82921SPaul Mullowney   PetscFunctionBegin;
7329566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm,A));
7339566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A,m,n,M,N));
7349566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(comm,&size));
7359ae82921SPaul Mullowney   if (size > 1) {
7369566063dSJacob Faibussowitsch     PetscCall(MatSetType(*A,MATMPIAIJCUSPARSE));
7379566063dSJacob Faibussowitsch     PetscCall(MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz));
7389ae82921SPaul Mullowney   } else {
7399566063dSJacob Faibussowitsch     PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE));
7409566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJSetPreallocation(*A,d_nz,d_nnz));
7419ae82921SPaul Mullowney   }
7429ae82921SPaul Mullowney   PetscFunctionReturn(0);
7439ae82921SPaul Mullowney }
7449ae82921SPaul Mullowney 
7453ca39a21SBarry Smith /*MC
7466bb69460SJunchao Zhang    MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.
747e057df02SPaul Mullowney 
7482692e278SPaul Mullowney    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
7492692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
7502692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
7519ae82921SPaul Mullowney 
7529ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
7539ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
7549ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
7559ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
7569ae82921SPaul Mullowney    the above preallocation routines for simplicity.
7579ae82921SPaul Mullowney 
7589ae82921SPaul Mullowney    Options Database Keys:
759e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
7608468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
7618468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
7628468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
7639ae82921SPaul Mullowney 
7649ae82921SPaul Mullowney   Level: beginner
7659ae82921SPaul Mullowney 
7666bb69460SJunchao Zhang .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
7676bb69460SJunchao Zhang M*/
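
/*
   Example (illustrative): selecting this matrix type and a non-default storage format from the
   options database, assuming the application calls MatSetFromOptions() on the matrix:

     ./myapp -mat_type aijcusparse -mat_cusparse_storage_format ell

   The same can be done in code with MatSetType(A,MATAIJCUSPARSE) followed by
   MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_ELL).
*/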
7686bb69460SJunchao Zhang 
7696bb69460SJunchao Zhang /*MC
7706bb69460SJunchao Zhang    MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.
7716bb69460SJunchao Zhang 
7726bb69460SJunchao Zhang   Level: beginner
7736bb69460SJunchao Zhang 
7746bb69460SJunchao Zhang .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
7759ae82921SPaul Mullowney M*/
7763fa6b06aSMark Adams 
777042217e8SBarry Smith // Get device (GPU) pointers to a stripped-down CSR view of the matrix; works for both sequential and MPI matrices.
778042217e8SBarry Smith PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
7793fa6b06aSMark Adams {
780042217e8SBarry Smith   PetscSplitCSRDataStructure d_mat;
781042217e8SBarry Smith   PetscMPIInt                size;
782042217e8SBarry Smith   int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
783042217e8SBarry Smith   PetscScalar                *aa = NULL,*ba = NULL;
78499551766SMark Adams   Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
785042217e8SBarry Smith   Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
786042217e8SBarry Smith   CsrMatrix                  *matrixA = NULL,*matrixB = NULL;
7873fa6b06aSMark Adams 
7883fa6b06aSMark Adams   PetscFunctionBegin;
78928b400f6SJacob Faibussowitsch   PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
790042217e8SBarry Smith   if (A->factortype != MAT_FACTOR_NONE) {
7913fa6b06aSMark Adams     *B = NULL;
7923fa6b06aSMark Adams     PetscFunctionReturn(0);
7933fa6b06aSMark Adams   }
7949566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
79599551766SMark Adams   // get jaca
7963fa6b06aSMark Adams   if (size == 1) {
797042217e8SBarry Smith     PetscBool isseqaij;
798042217e8SBarry Smith 
7999566063dSJacob Faibussowitsch     PetscCall(PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij));
800042217e8SBarry Smith     if (isseqaij) {
8013fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)A->data;
80228b400f6SJacob Faibussowitsch       PetscCheck(jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
803042217e8SBarry Smith       cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
804042217e8SBarry Smith       d_mat = cusparsestructA->deviceMat;
8059566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
8063fa6b06aSMark Adams     } else {
8073fa6b06aSMark Adams       Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
80828b400f6SJacob Faibussowitsch       PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
809042217e8SBarry Smith       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
8103fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)aij->A->data;
811042217e8SBarry Smith       cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
812042217e8SBarry Smith       d_mat = spptr->deviceMat;
8139566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
8143fa6b06aSMark Adams     }
815042217e8SBarry Smith     if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
816042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
81728b400f6SJacob Faibussowitsch       PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
818042217e8SBarry Smith       matrixA = (CsrMatrix*)matstruct->mat;
819042217e8SBarry Smith       bi = NULL;
820042217e8SBarry Smith       bj = NULL;
821042217e8SBarry Smith       ba = NULL;
822042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
823042217e8SBarry Smith   } else {
824042217e8SBarry Smith     Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
82528b400f6SJacob Faibussowitsch     PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
826042217e8SBarry Smith     jaca = (Mat_SeqAIJ*)aij->A->data;
82799551766SMark Adams     jacb = (Mat_SeqAIJ*)aij->B->data;
828042217e8SBarry Smith     Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
8293fa6b06aSMark Adams 
830*08401ef6SPierre Jolivet     PetscCheck(A->nooffprocentries || aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
831042217e8SBarry Smith     cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
832042217e8SBarry Smith     Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
833*08401ef6SPierre Jolivet     PetscCheck(cusparsestructA->format==MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
834042217e8SBarry Smith     if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
8359566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
8369566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B));
837042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
838042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
83928b400f6SJacob Faibussowitsch       PetscCheck(matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
84028b400f6SJacob Faibussowitsch       PetscCheck(matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
841042217e8SBarry Smith       matrixA = (CsrMatrix*)matstructA->mat;
842042217e8SBarry Smith       matrixB = (CsrMatrix*)matstructB->mat;
8433fa6b06aSMark Adams       if (jacb->compressedrow.use) {
844042217e8SBarry Smith         if (!cusparsestructB->rowoffsets_gpu) {
845042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
846042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
8473fa6b06aSMark Adams         }
848042217e8SBarry Smith         bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
849042217e8SBarry Smith       } else {
850042217e8SBarry Smith         bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
851042217e8SBarry Smith       }
852042217e8SBarry Smith       bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
853042217e8SBarry Smith       ba = thrust::raw_pointer_cast(matrixB->values->data());
854042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
855042217e8SBarry Smith     d_mat = spptr->deviceMat;
856042217e8SBarry Smith   }
8573fa6b06aSMark Adams   if (jaca->compressedrow.use) {
858042217e8SBarry Smith     if (!cusparsestructA->rowoffsets_gpu) {
859042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
860042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
8613fa6b06aSMark Adams     }
862042217e8SBarry Smith     ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
8633fa6b06aSMark Adams   } else {
864042217e8SBarry Smith     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
8653fa6b06aSMark Adams   }
866042217e8SBarry Smith   aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
867042217e8SBarry Smith   aa = thrust::raw_pointer_cast(matrixA->values->data());
868042217e8SBarry Smith 
869042217e8SBarry Smith   if (!d_mat) {
870042217e8SBarry Smith     PetscSplitCSRDataStructure h_mat;
871042217e8SBarry Smith 
872042217e8SBarry Smith     // create and populate the struct on the host, then copy it to the device
8739566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A,"Create device matrix\n"));
8749566063dSJacob Faibussowitsch     PetscCall(PetscNew(&h_mat));
8759566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void**)&d_mat,sizeof(*d_mat)));
876042217e8SBarry Smith     if (size > 1) { /* need the colmap array */
877042217e8SBarry Smith       Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
87899551766SMark Adams       PetscInt   *colmap;
879042217e8SBarry Smith       PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;
880042217e8SBarry Smith 
881*08401ef6SPierre Jolivet       PetscCheck(!n || aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
882042217e8SBarry Smith 
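      // colmap maps a global column index to (local column in B)+1; entries left at 0 mark columns not present in the off-diagonal block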
8839566063dSJacob Faibussowitsch       PetscCall(PetscCalloc1(N+1,&colmap));
884042217e8SBarry Smith       for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
885365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
886365b711fSMark Adams       { // cuSPARSE stores these indices as 32-bit int; make 64-bit PetscInt copies on the device
88799551766SMark Adams         int        *h_bi32, *h_bj32;
88899551766SMark Adams         PetscInt   *h_bi64, *h_bj64, *d_bi64, *d_bj64;
8899566063dSJacob Faibussowitsch         PetscCall(PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64));
8909566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost));
89199551766SMark Adams         for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
8929566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost));
89399551766SMark Adams         for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];
89499551766SMark Adams 
8959566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64)));
8969566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice));
8979566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64)));
8989566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice));
89999551766SMark Adams 
90099551766SMark Adams         h_mat->offdiag.i = d_bi64;
90199551766SMark Adams         h_mat->offdiag.j = d_bj64;
90299551766SMark Adams         h_mat->allocated_indices = PETSC_TRUE;
90399551766SMark Adams 
9049566063dSJacob Faibussowitsch         PetscCall(PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64));
905365b711fSMark Adams       }
906365b711fSMark Adams #else
90799551766SMark Adams       h_mat->offdiag.i = (PetscInt*)bi;
90899551766SMark Adams       h_mat->offdiag.j = (PetscInt*)bj;
90999551766SMark Adams       h_mat->allocated_indices = PETSC_FALSE;
910365b711fSMark Adams #endif
911042217e8SBarry Smith       h_mat->offdiag.a = ba;
912042217e8SBarry Smith       h_mat->offdiag.n = A->rmap->n;
913042217e8SBarry Smith 
9149566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap)));
9159566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice));
9169566063dSJacob Faibussowitsch       PetscCall(PetscFree(colmap));
917042217e8SBarry Smith     }
918042217e8SBarry Smith     h_mat->rstart = A->rmap->rstart;
919042217e8SBarry Smith     h_mat->rend   = A->rmap->rend;
920042217e8SBarry Smith     h_mat->cstart = A->cmap->rstart;
921042217e8SBarry Smith     h_mat->cend   = A->cmap->rend;
92249b994a9SMark Adams     h_mat->M      = A->cmap->N;
923365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
924365b711fSMark Adams     {
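      // as with the off-diagonal block above: widen the 32-bit cuSPARSE indices of the diagonal block to 64-bit PetscInt device copies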
925*08401ef6SPierre Jolivet       PetscCheck(sizeof(PetscInt) == 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"size of PetscInt = %d",(int)sizeof(PetscInt));
92699551766SMark Adams       int        *h_ai32, *h_aj32;
92799551766SMark Adams       PetscInt   *h_ai64, *h_aj64, *d_ai64, *d_aj64;
9289566063dSJacob Faibussowitsch       PetscCall(PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64));
9299566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost));
93099551766SMark Adams       for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
9319566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost));
93299551766SMark Adams       for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];
93399551766SMark Adams 
9349566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64)));
9359566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice));
9369566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64)));
9379566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice));
93899551766SMark Adams 
93999551766SMark Adams       h_mat->diag.i = d_ai64;
94099551766SMark Adams       h_mat->diag.j = d_aj64;
94199551766SMark Adams       h_mat->allocated_indices = PETSC_TRUE;
94299551766SMark Adams 
9439566063dSJacob Faibussowitsch       PetscCall(PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64));
944365b711fSMark Adams     }
945365b711fSMark Adams #else
94699551766SMark Adams     h_mat->diag.i = (PetscInt*)ai;
94799551766SMark Adams     h_mat->diag.j = (PetscInt*)aj;
94899551766SMark Adams     h_mat->allocated_indices = PETSC_FALSE;
949365b711fSMark Adams #endif
950042217e8SBarry Smith     h_mat->diag.a = aa;
951042217e8SBarry Smith     h_mat->diag.n = A->rmap->n;
952042217e8SBarry Smith     h_mat->rank   = PetscGlobalRank;
953042217e8SBarry Smith     // copy pointers and metadata to device
9549566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice));
9559566063dSJacob Faibussowitsch     PetscCall(PetscFree(h_mat));
956042217e8SBarry Smith   } else {
9579566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A,"Reusing device matrix\n"));
958042217e8SBarry Smith   }
959042217e8SBarry Smith   *B = d_mat;
960042217e8SBarry Smith   A->offloadmask = PETSC_OFFLOAD_GPU;
9613fa6b06aSMark Adams   PetscFunctionReturn(0);
9623fa6b06aSMark Adams }
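
/*
   Minimal usage sketch (illustrative; the kernel name and launch configuration are hypothetical):
   obtain the device-side CSR view and hand it to a user CUDA kernel that inserts values directly
   on the GPU.

     PetscSplitCSRDataStructure d_mat;
     PetscCall(MatCUSPARSEGetDeviceMatWrite(A,&d_mat));
     my_assembly_kernel<<<nblocks,nthreads>>>(d_mat);   // hypothetical user kernel
     PetscCallCUDA(cudaDeviceSynchronize());
*/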
963