/* /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 48a46eb9bd028bec07ec0f396b1a3abb43f14558) */
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

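/* Functor for thrust zip iterators: assigns element 0 of the tuple to element 1 (used below to scatter permuted values) */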
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

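/* Free the device-side COO support arrays allocated by the preallocation routines below (both the basic and the extended path) */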
static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat) {
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(0);
  if (cusparseStruct->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
    PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) {
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)a->spptr;
  PetscInt            n    = cusp->coo_nd + cusp->coo_no;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                          *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v, v + n);
      PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    PetscCall(PetscLogGpuTimeEnd());
    delete w;
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, v, imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, v ? v + cusp->coo_nd : NULL, imode));
  }
  PetscFunctionReturn(0);
}

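/* Predicate on (row, col, perm) tuples: true when the column index lies inside the local diagonal block [cstart, cend) */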
template <typename Tuple>
struct IsNotOffDiagT {
  PetscInt _cstart, _cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(Tuple t) { return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); }
};

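/* Predicate on column indices: true when the column falls outside the local diagonal block [cstart, cend) */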
struct IsOffDiag {
  PetscInt _cstart, _cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(const PetscInt &c) { return c < _cstart || c >= _cend; }
};

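/* Shift a global index to a local one by subtracting the start of the ownership range */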
struct GlobToLoc {
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) { }
  __host__ __device__ inline PetscInt operator()(const PetscInt &c) { return c - _start; }
};

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[]) {
  Mat_MPIAIJ            *b    = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE    *cusp = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt               N, *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  d_i.assign(coo_i, coo_i + n);
  d_j.assign(coo_j, coo_j + n);
  PetscCall(PetscLogGpuTimeBegin());
  auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
    firstoffd  = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd);
  cusp->coo_no = thrust::distance(firstoffd, d_j.end());

  /* from global to local */
  thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  PetscCall(PetscMalloc1(cusp->coo_no, &jj)); /* jj[] will store compacted col ids of the offdiag part */
  PetscCallCUDA(cudaMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device, o_j, d_j.end());
  auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j, wit);
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff, &b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt)));
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj));
  PetscCheck(N == cusp->coo_no, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size", N, cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
  PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(PetscLogObjectParent((PetscObject)B, (PetscObject)b->A));
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
  PetscCall(MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(PetscLogObjectParent((PetscObject)B, (PetscObject)b->B));

  /* GPU memory, cusparse specific call handles it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj));
  PetscCall(PetscFree(jj));

  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat));

  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(0);
}

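/* A minimal sketch of the user-level COO assembly flow served by the routines in this file
   (array contents are illustrative; MatSetPreallocationCOO() and MatSetValuesCOO() are the
   public entry points that dispatch here):

     PetscInt    coo_i[] = {...};  // global row index of each entry
     PetscInt    coo_j[] = {...};  // global column index of each entry
     PetscScalar coo_v[] = {...};
     MatSetPreallocationCOO(A, ncoo, coo_i, coo_j);  // analyze the pattern once
     MatSetValuesCOO(A, coo_v, ADD_VALUES);          // fill (and later refill) the values

   The basic path below handles patterns whose entries are all locally owned and nonnegative;
   the extended path also supports repeated, negative (ignored), and off-process entries. */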
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) {
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart, rend;

  PetscFunctionBegin;
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev                   = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar)));

    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(0);
}

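/* Gather the COO values destined for other ranks into a contiguous send buffer, buf[i] = kv[perm[i]].
   A grid-stride loop lets any launch configuration cover all nnz entries. */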
__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[]) {
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
}

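/* Accumulate locally owned COO values into the diagonal (A) and off-diagonal (B) blocks.
   For nonzero i, jmap1[i]..jmap1[i+1] delimits the (possibly repeated) input entries that fold
   into it, and perm1[] maps those entries back into the user-provided value array kv[]. */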
__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[]) {
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      const PetscCount j = i - Annz; /* use a separate index so the grid-stride step sees the unmodified i */
      for (PetscCount k = Bjmap1[j]; k < Bjmap1[j + 1]; k++) sum += kv[Bperm1[k]];
      Ba[j] = (imode == INSERT_VALUES ? 0.0 : Ba[j]) + sum;
    }
  }
}

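/* Accumulate values received from other ranks: imap2[] gives the destination nonzero in the local
   CSR arrays, while jmap2[]/perm2[] play the same roles as in MatAddLocalCOOValues() but index into
   the receive buffer. Remote contributions always add, since they supplement the local pass. */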
__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[]) {
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      const PetscCount j = i - Annz2; /* keep the loop index intact for the grid-stride step */
      for (PetscCount k = Bjmap2[j]; k < Bjmap2[j + 1]; k++) Ba[Bimap2[j]] += kv[Bperm2[k]];
    }
  }
}

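/* Extended COO value insertion: pack remote entries into a send buffer, start the PetscSF
   reduction, fold the local entries into A and B while the communication is in flight, then
   fold the received remote entries in a second kernel. */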
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode) {
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
  Mat                 A = mpiaij->A, B = mpiaij->B;
  PetscCount          Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa, *Ba = NULL;
  PetscScalar        *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1     = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size));
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If the user passed v[] on the host, copy it to the device */
      PetscCallCUDA(cudaMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen + 255) / 256, 256>>>(v1, mpiaij->sendlen, Cperm1, vsend);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz + Bnnz > 0) {
      MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    PetscCall(PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

    /* Add received remote entries to A and B */
    if (Annz2 + Bnnz2 > 0) {
      MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
    }
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat, v, imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc) {
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    for (i = 0; i < dn; i++) gidx[i] = cst + i;
    for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[]) {
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) { PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]); }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) { PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]); }
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the off-diagonal matrix b->B will have been resized, we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
    PetscCall(PetscLogObjectParent((PetscObject)B, (PetscObject)b->A));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
    PetscCall(PetscLogObjectParent((PetscObject)B, (PetscObject)b->B));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

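/* y = A*x: start the halo exchange for the off-process part of x, multiply by the diagonal
   block while the exchange is in flight, then fold in the off-diagonal contribution */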
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A) {
  Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) {
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG: cusparseStruct->diagGPUMatFormat = format; break;
  case MAT_CUSPARSE_MULT_OFFDIAG: cusparseStruct->offdiagGPUMatFormat = format; break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject) {
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode) {
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr;
  PetscObjectState    onnz   = A->nonzerostate;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) {
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Have device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
  }
  /* Free COO */
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat *newmat) {
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) {
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal portion of the local submatrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal portion of the local submatrix
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal portion of the local submatrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal portion of the local submatrix
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz = o_nz = PETSC_DEFAULT and d_nnz = o_nnz = NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.
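
   Example usage (a sketch; error checking with PetscCall() is omitted, and M and N are
   assumed to hold previously chosen global sizes):
.vb
   Mat A;
   MatCreateAIJCUSPARSE(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, M, N, 5, NULL, 2, NULL, &A);
   /* ... MatSetValues(), MatAssemblyBegin()/MatAssemblyEnd(), use A, ... */
   MatDestroy(&A);
.ve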

   Level: intermediate

.seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATMPIAIJCUSPARSE`, `MATAIJCUSPARSE`
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A) {
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  if (size > 1) {
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  } else {
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

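   For example, an application that creates its matrix with MatSetFromOptions() can select this
   type and its storage format entirely from the command line (the executable name is illustrative):
.vb
   ./my_app -mat_type mpiaijcusparse -mat_cusparse_storage_format ell
.ve
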
  Level: beginner

 .seealso: `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.

  Level: beginner

 .seealso: `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
M*/

// Get GPU pointers to the stripped-down Mat; works for both Seq and MPI Mat.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B) {
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  int                       *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL;
  PetscScalar               *aa = NULL, *ba = NULL;
  Mat_SeqAIJ                *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE        *cusparsestructA = NULL;
  CsrMatrix                 *matrixA = NULL, *matrixB = NULL;

  PetscFunctionBegin;
  PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  // get jaca
  if (size == 1) {
    PetscBool isseqaij;

    PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij));
    if (isseqaij) {
      jaca = (Mat_SeqAIJ *)A->data;
      PetscCheck(jaca->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE *)A->spptr;
      d_mat           = cusparsestructA->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
      PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
      jaca                      = (Mat_SeqAIJ *)aij->A->data;
      cusparsestructA           = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
      d_mat                     = spptr->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
    }
    if (cusparsestructA->format == MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
      PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix *)matstruct->mat;
      bi      = NULL;
      bj      = NULL;
      ba      = NULL;
    } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
    PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
    jaca                      = (Mat_SeqAIJ *)aij->A->data;
    jacb                      = (Mat_SeqAIJ *)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
7823fa6b06aSMark Adams 
78308401ef6SPierre Jolivet     PetscCheck(A->nooffprocentries || aij->donotstash, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
784042217e8SBarry Smith     cusparsestructA                     = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
785042217e8SBarry Smith     Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE *)aij->B->spptr;
78608401ef6SPierre Jolivet     PetscCheck(cusparsestructA->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat A needs MAT_CUSPARSE_CSR");
787042217e8SBarry Smith     if (cusparsestructB->format == MAT_CUSPARSE_CSR) {
7889566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
7899566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B));
790042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
791042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructB->mat;
79228b400f6SJacob Faibussowitsch       PetscCheck(matstructA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
79328b400f6SJacob Faibussowitsch       PetscCheck(matstructB, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for B");
794042217e8SBarry Smith       matrixA = (CsrMatrix *)matstructA->mat;
795042217e8SBarry Smith       matrixB = (CsrMatrix *)matstructB->mat;
7963fa6b06aSMark Adams       if (jacb->compressedrow.use) {
797042217e8SBarry Smith         if (!cusparsestructB->rowoffsets_gpu) {
798042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
799042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1);
8003fa6b06aSMark Adams         }
801042217e8SBarry Smith         bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
802042217e8SBarry Smith       } else {
803042217e8SBarry Smith         bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
804042217e8SBarry Smith       }
805042217e8SBarry Smith       bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
806042217e8SBarry Smith       ba = thrust::raw_pointer_cast(matrixB->values->data());
807042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_CUSPARSE_CSR");
808042217e8SBarry Smith     d_mat = spptr->deviceMat;
809042217e8SBarry Smith   }
8103fa6b06aSMark Adams   if (jaca->compressedrow.use) {
811042217e8SBarry Smith     if (!cusparsestructA->rowoffsets_gpu) {
812042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
813042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1);
8143fa6b06aSMark Adams     }
815042217e8SBarry Smith     ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
8163fa6b06aSMark Adams   } else {
817042217e8SBarry Smith     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
8183fa6b06aSMark Adams   }
819042217e8SBarry Smith   aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
820042217e8SBarry Smith   aa = thrust::raw_pointer_cast(matrixA->values->data());
821042217e8SBarry Smith 
822042217e8SBarry Smith   if (!d_mat) {
823042217e8SBarry Smith     PetscSplitCSRDataStructure h_mat;
824042217e8SBarry Smith 
825042217e8SBarry Smith     // create and populate the struct on the host, then copy it to the device
8269566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Create device matrix\n"));
8279566063dSJacob Faibussowitsch     PetscCall(PetscNew(&h_mat));
8289566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&d_mat, sizeof(*d_mat)));
829042217e8SBarry Smith     if (size > 1) { /* need the colmap array */
830042217e8SBarry Smith       Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
83199551766SMark Adams       PetscInt   *colmap;
832042217e8SBarry Smith       PetscInt    ii, n = aij->B->cmap->n, N = A->cmap->N;
833042217e8SBarry Smith 
83408401ef6SPierre Jolivet       PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");
835042217e8SBarry Smith 
8369566063dSJacob Faibussowitsch       PetscCall(PetscCalloc1(N + 1, &colmap));
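      // colmap maps a global column index to (local off-diagonal column index + 1); entries left
      // zero by PetscCalloc1() mark columns with no off-diagonal entry on this rank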
837042217e8SBarry Smith       for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = (int)(ii + 1);
838365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
839365b711fSMark Adams       { // cuSPARSE stores row offsets and column indices as 32-bit int; widen them to 64-bit PetscInt for the device struct
84099551766SMark Adams         int      *h_bi32, *h_bj32;
84199551766SMark Adams         PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
8429566063dSJacob Faibussowitsch         PetscCall(PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64));
8439566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), cudaMemcpyDeviceToHost));
84499551766SMark Adams         for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i];
8459566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), cudaMemcpyDeviceToHost));
84699551766SMark Adams         for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i];
84799551766SMark Adams 
8489566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64)));
8499566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), cudaMemcpyHostToDevice));
8509566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64)));
8519566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), cudaMemcpyHostToDevice));
85299551766SMark Adams 
85399551766SMark Adams         h_mat->offdiag.i         = d_bi64;
85499551766SMark Adams         h_mat->offdiag.j         = d_bj64;
85599551766SMark Adams         h_mat->allocated_indices = PETSC_TRUE;
85699551766SMark Adams 
8579566063dSJacob Faibussowitsch         PetscCall(PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64));
858365b711fSMark Adams       }
859365b711fSMark Adams #else
86099551766SMark Adams       h_mat->offdiag.i         = (PetscInt *)bi;
86199551766SMark Adams       h_mat->offdiag.j         = (PetscInt *)bj;
86299551766SMark Adams       h_mat->allocated_indices = PETSC_FALSE;
863365b711fSMark Adams #endif
864042217e8SBarry Smith       h_mat->offdiag.a = ba;
865042217e8SBarry Smith       h_mat->offdiag.n = A->rmap->n;
866042217e8SBarry Smith 
8679566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap)));
8689566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), cudaMemcpyHostToDevice));
8699566063dSJacob Faibussowitsch       PetscCall(PetscFree(colmap));
870042217e8SBarry Smith     }
871042217e8SBarry Smith     h_mat->rstart = A->rmap->rstart;
872042217e8SBarry Smith     h_mat->rend   = A->rmap->rend;
873042217e8SBarry Smith     h_mat->cstart = A->cmap->rstart;
874042217e8SBarry Smith     h_mat->cend   = A->cmap->rend;
87549b994a9SMark Adams     h_mat->M      = A->cmap->N;
876365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
877365b711fSMark Adams     {
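      // As for the off-diagonal block above: cuSPARSE keeps the CSR row offsets and column
      // indices as 32-bit int, so stage them on the host and widen to 64-bit PetscInt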
87899551766SMark Adams       int      *h_ai32, *h_aj32;
87999551766SMark Adams       PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;
88063a3b9bcSJacob Faibussowitsch 
88163a3b9bcSJacob Faibussowitsch       static_assert(sizeof(PetscInt) == 8, "");
8829566063dSJacob Faibussowitsch       PetscCall(PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64));
8839566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), cudaMemcpyDeviceToHost));
88499551766SMark Adams       for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i];
8859566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), cudaMemcpyDeviceToHost));
88699551766SMark Adams       for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i];
88799551766SMark Adams 
8889566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64)));
8899566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), cudaMemcpyHostToDevice));
8909566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64)));
8919566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), cudaMemcpyHostToDevice));
89299551766SMark Adams 
89399551766SMark Adams       h_mat->diag.i            = d_ai64;
89499551766SMark Adams       h_mat->diag.j            = d_aj64;
89599551766SMark Adams       h_mat->allocated_indices = PETSC_TRUE;
89699551766SMark Adams 
8979566063dSJacob Faibussowitsch       PetscCall(PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64));
898365b711fSMark Adams     }
899365b711fSMark Adams #else
90099551766SMark Adams     h_mat->diag.i            = (PetscInt *)ai;
90199551766SMark Adams     h_mat->diag.j            = (PetscInt *)aj;
90299551766SMark Adams     h_mat->allocated_indices = PETSC_FALSE;
903365b711fSMark Adams #endif
904042217e8SBarry Smith     h_mat->diag.a = aa;
905042217e8SBarry Smith     h_mat->diag.n = A->rmap->n;
906042217e8SBarry Smith     h_mat->rank   = PetscGlobalRank;
907042217e8SBarry Smith     // copy pointers and metadata to device
9089566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_mat, h_mat, sizeof(*d_mat), cudaMemcpyHostToDevice));
9099566063dSJacob Faibussowitsch     PetscCall(PetscFree(h_mat));
910042217e8SBarry Smith   } else {
9119566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Reusing device matrix\n"));
912042217e8SBarry Smith   }
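  // hand back the cached device struct and mark the GPU copy as authoritative, since the caller
  // is expected to write matrix values on the device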
913042217e8SBarry Smith   *B             = d_mat;
914042217e8SBarry Smith   A->offloadmask = PETSC_OFFLOAD_GPU;
9153fa6b06aSMark Adams   PetscFunctionReturn(0);
9163fa6b06aSMark Adams }
917