xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision eec179cf895b6fbcd6a0b58694319392b06e361b)
1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
299acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
30fd2b57fSKarl Rupp 
43d13b8fdSMatthew G. Knepley #include <petscconf.h>
59ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
657d48284SJunchao Zhang #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
73d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
87e8381f9SStefano Zampini #include <thrust/advance.h>
9a2cee5feSJed Brown #include <thrust/partition.h>
10a2cee5feSJed Brown #include <thrust/sort.h>
11a2cee5feSJed Brown #include <thrust/unique.h>
124eb5378fSStefano Zampini #include <petscsf.h>
137e8381f9SStefano Zampini 
struct VecCUDAEquals {
  /* Functor for thrust::for_each over a zip iterator: copies the first
     tuple component into the second, i.e. a device-side element assign
     used to gather permuted values into a work array. */
  template <typename TupleT>
  __host__ __device__ void operator()(TupleT zipped)
  {
    thrust::get<1>(zipped) = thrust::get<0>(zipped);
  }
};
217e8381f9SStefano Zampini 
/* Free all device-side COO assembly state attached to an MPIAIJCUSPARSE matrix.

   Releases the extended-COO device arrays (allocated by
   MatSetPreallocationCOO_MPIAIJCUSPARSE when use_extended_coo is set) and the
   basic-COO permutation buffers coo_p/coo_pw.  Safe to call when no COO state
   exists (returns immediately if the cusparse struct is absent). */
static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(0); /* nothing was ever allocated */
  if (cusparseStruct->use_extended_coo) {
    /* device maps/permutations and send/recv buffers of the extended COO path */
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
    PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  /* basic-COO permutation (coo_p) and permuted work values (coo_pw); deleting
     NULL is a no-op so this is safe on the extended path too */
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(0);
}
51cbc6b225SStefano Zampini 
/* Basic COO value insertion for MPIAIJCUSPARSE.

   v[] holds values in the caller's original COO entry order; n is the total
   entry count (coo_nd diagonal entries followed by coo_no off-diagonal ones
   after the preallocation-time partition).  If a permutation coo_p was built
   during preallocation, v is gathered through it into the device work array
   coo_pw before being split between the diagonal (a->A) and off-diagonal
   (a->B) sequential blocks; otherwise the entries are already partitioned and
   the two halves of v are passed straight through.  v may be NULL. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)a->spptr;
  PetscInt            n    = cusp->coo_nd + cusp->coo_no;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                          *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v); /* v already on device: wrap it */
    } else {
      /* stage host values in a temporary device array */
      w = new THRUSTARRAY(n);
      w->assign(v, v + n);
      PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
      d_v = w->data();
    }

    /* gather d_v through the permutation coo_p into coo_pw: for each k,
       coo_pw[k] = d_v[coo_p[k]] (VecCUDAEquals copies tuple slot 0 to slot 1) */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    PetscCall(PetscLogGpuTimeEnd());
    delete w; /* no-op if v was device memory */
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode));
  } else {
    /* no permutation needed: first coo_nd values go to A, remainder to B */
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, v, imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, v ? v + cusp->coo_nd : NULL, imode));
  }
  PetscFunctionReturn(0);
}
867e8381f9SStefano Zampini 
template <typename Tuple>
struct IsNotOffDiagT {
  PetscInt _cstart, _cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  /* True when the column index (tuple component 1) lies inside the local
     diagonal-block column range [_cstart, _cend). */
  __host__ __device__ inline bool operator()(Tuple t)
  {
    const PetscInt col = thrust::get<1>(t);
    return col >= _cstart && col < _cend;
  }
};
947e8381f9SStefano Zampini 
struct IsOffDiag {
  PetscInt _cstart, _cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  /* True when column c falls outside the local diagonal-block column range
     [_cstart, _cend), i.e. the entry belongs to the off-diagonal part. */
  __host__ __device__ inline bool operator()(const PetscInt &c) { return !(_cstart <= c && c < _cend); }
};
1017e8381f9SStefano Zampini 
struct GlobToLoc {
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) { }
  /* Map a global index to a local one by subtracting the owned-range start. */
  __host__ __device__ inline PetscInt operator()(const PetscInt &g) { return g - _start; }
};
1087e8381f9SStefano Zampini 
/* Basic COO preallocation for MPIAIJCUSPARSE.

   Takes n COO entries (coo_i[], coo_j[] in global indices, all assumed to be
   locally owned rows), partitions them on the device so that diagonal-block
   entries precede off-diagonal ones, converts indices to local numbering,
   builds the off-diagonal column map (b->garray), and (re)creates the
   sequential diagonal (b->A) and off-diagonal (b->B) AIJCUSPARSE blocks with
   their own COO preallocation.  Records the partition permutation in
   cusp->coo_p (used later by MatSetValuesCOO_MPIAIJCUSPARSE_Basic) when a
   reordering was needed. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ            *b    = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE    *cusp = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt               N, *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  d_i.assign(coo_i, coo_i + n);
  d_j.assign(coo_j, coo_j + n);
  PetscCall(PetscLogGpuTimeBegin());
  /* locate the first off-diagonal entry, then the first diagonal entry after
     it; if both exist the entries are interleaved and must be partitioned */
  auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* stable-partition (i,j,identity-permutation) triples so diagonal entries
       come first; coo_p records where each original entry moved */
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
    firstoffd  = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd); /* # diagonal entries */
  cusp->coo_no = thrust::distance(firstoffd, d_j.end());   /* # off-diagonal entries */

  /* from global to local */
  thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  PetscCall(PetscMalloc1(cusp->coo_no, &jj)); /* jj[] will store compacted col ids of the offdiag part */
  PetscCallCUDA(cudaMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device, o_j, d_j.end());
  auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j, wit); /* # distinct off-diagonal (ghost) columns */
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff, &b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt)));
  /* compact jj[] from global ghost columns to local indices into garray via a
     hash-based global-to-local mapping */
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj));
  PetscCheck(N == cusp->coo_no, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size", N, cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  /* recreate the sequential diagonal/off-diagonal blocks */
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
  PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
  PetscCall(MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));

  /* GPU memory, cusparse specific call handles it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj));
  PetscCall(PetscFree(jj));

  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat));

  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(0);
}
1869ae82921SPaul Mullowney 
/* COO preallocation entry point for MPIAIJCUSPARSE.

   Resets any previous (pre)allocation state, then chooses between two paths:
   the basic path (all entries local and nonnegative on every rank) handled by
   MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic, and the extended path, which
   delegates CPU-side analysis to MatSetPreallocationCOO_MPIAIJ and then
   mirrors the resulting maps/permutations (Ajmap1, Aperm1, ...) plus the
   send/receive buffers onto the device for use by
   MatSetValuesCOO_MPIAIJCUSPARSE. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart, rend;

  PetscFunctionBegin;
  /* wipe the scatter/column-map state so the matrix can be re-preallocated */
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
    PetscCall(PetscGetMemType(coo_i, &mtype));
    /* the scan below is only possible for host indices; device indices keep
       coo_basic = PETSC_TRUE on this rank and the Allreduce decides */
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    /* CPU-side analysis builds the maps/permutations in mpiaij */
    PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev                   = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    /* allocate device mirrors of the host-side COO maps (jmap: CSR-like
       offsets per nonzero; perm: permutation of COO entries; imap: target
       nonzero indices for remote contributions) */
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount)));

    /* pack permutation and communication buffers for remote entries */
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar)));

    /* copy the host maps built by MatSetPreallocationCOO_MPIAIJ to the device */
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(0);
}
269219fbbafSJunchao Zhang 
/* CUDA kernel: pack values destined for remote ranks into the send buffer,
   buf[i] = kv[perm[i]] for i in [0, nnz), using a grid-stride loop so any
   launch configuration covers all entries. */
__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  const PetscCount stride = gridDim.x * blockDim.x;
  for (PetscCount idx = blockIdx.x * blockDim.x + threadIdx.x; idx < nnz; idx += stride) buf[idx] = kv[perm[idx]];
}
276219fbbafSJunchao Zhang 
/* CUDA kernel: accumulate locally-owned COO values kv[] into the diagonal
   (Aa) and off-diagonal (Ba) nonzero arrays.  Grid-stride loop over
   Annz + Bnnz work items: item i < Annz updates nonzero i of A, the rest
   update B (after shifting i by Annz).  For nonzero i, jmap[i]..jmap[i+1]
   delimits the range in perm[] of repeated COO entries mapping to it, which
   are summed; INSERT_VALUES discards the previous matrix value, otherwise
   the sum is added on top. */
__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      /* NOTE: i is rebased into B's index space here; the loop increment then
         advances the rebased value, which preserves the global stride */
      i -= Annz;
      for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
      Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
    }
  }
}
293158ec288SJunchao Zhang 
/* CUDA kernel: add COO values received from remote ranks (kv[] = recv buffer)
   into A and B.  Grid-stride loop over Annz2 + Bnnz2 work items: item
   i < Annz2 handles remote contributions to A, the rest (after shifting i by
   Annz2) handle B.  jmap2[i]..jmap2[i+1] delimits the received entries in
   perm2[] for this work item, and imap2[i] is the target nonzero index they
   are accumulated into.  Remote values are always added on top of the local
   result (imode was already applied by MatAddLocalCOOValues). */
__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      /* rebase i into B's index space; the loop increment advances the
         rebased value, preserving the global stride */
      i -= Annz2;
      for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
    }
  }
}
307219fbbafSJunchao Zhang 
/* COO value insertion for MPIAIJCUSPARSE.

   On the extended-COO path: stages host values on the device if needed, packs
   entries owned by other ranks into the device send buffer, overlaps the
   PetscSF reduction of those entries with the local accumulation kernel, then
   adds the received remote entries.  On the basic path it defers to
   MatSetValuesCOO_MPIAIJCUSPARSE_Basic.  In both cases the matrix data ends
   up resident on the GPU (offloadmask = PETSC_OFFLOAD_GPU).

   Fix: removed the unused local `size` and its MPI_Comm_size() call — the
   value was never read. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
  Mat                 A = mpiaij->A, B = mpiaij->B;
  PetscCount          Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa, *Ba = NULL;
  PetscScalar        *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1     = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      PetscCallCUDA(cudaMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen + 255) / 256, 256>>>(v1, mpiaij->sendlen, Cperm1, vsend);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz + Bnnz > 0) {
      MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    PetscCall(PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

    /* Add received remote entries to A and B */
    if (Annz2 + Bnnz2 > 0) {
      MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
    }
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1)); /* free the device staging copy */
  } else {
    PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat, v, imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
377219fbbafSJunchao Zhang 
/* Merge the diagonal and off-diagonal blocks of an MPIAIJCUSPARSE matrix into
   a single sequential local matrix A_loc (via MatSeqAIJCUSPARSEMergeMats).
   If glob is non-NULL, also build the IS of global column indices of the
   merged matrix: the owned columns [cst, cst+dn) first, followed by the ghost
   columns from cmap. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
{
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    /* owned columns first, then the global ids of the ghost columns */
    for (i = 0; i < dn; i++) gidx[i] = cst + i;
    for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(0);
}
399ed502f03SStefano Zampini 
/* MatMPIAIJSetPreallocation() implementation for MATMPIAIJCUSPARSE.

   Sets up the row/column layouts, validates the user-supplied per-row
   nonzero counts, (re)creates the sequential diagonal (b->A) and
   off-diagonal (b->B) blocks as MATSEQAIJCUSPARSE, and preallocates them
   with d_nz/d_nnz and o_nz/o_nnz respectively. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  /* in debug builds, reject negative per-row nonzero counts up front */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
  /* discard any stale column map, garray, ghost vector and scatter; they
     are rebuilt during the next assembly */
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    /* on a single rank the off-diagonal block has zero columns */
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  /* keep the CPU-binding state of the blocks in sync with the parent matrix */
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  /* propagate the storage formats chosen on the MPI matrix to the blocks */
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
446e057df02SPaul Mullowney 
/* yy = A*xx for a MATMPIAIJCUSPARSE matrix.

   The scatter gathering off-process entries of xx into the ghost vector is
   started before the purely local diagonal-block product so the two can
   overlap; the off-diagonal contribution is added once the scatter ends. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  /* start gathering ghost values of xx */
  PetscCall(VecScatterBegin(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD));
  /* local (diagonal block) product while communication is in flight */
  PetscCall((*mpiaij->A->ops->mult)(mpiaij->A, xx, yy));
  PetscCall(VecScatterEnd(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD));
  /* accumulate the off-diagonal contribution */
  PetscCall((*mpiaij->B->ops->multadd)(mpiaij->B, mpiaij->lvec, yy, yy));
  PetscFunctionReturn(0);
}
458ca45077fSPaul Mullowney 
/* Zeroes all stored entries of the matrix by zeroing both the diagonal
   and the off-diagonal sequential blocks. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(mpiaij->A));
  PetscCall(MatZeroEntries(mpiaij->B));
  PetscFunctionReturn(0);
}
468042217e8SBarry Smith 
/* zz = yy + A*xx for a MATMPIAIJCUSPARSE matrix.

   Same overlap scheme as MatMult: ghost gathering of xx runs concurrently
   with the diagonal-block multiply-add. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  /* start gathering ghost values of xx */
  PetscCall(VecScatterBegin(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD));
  /* zz = yy + (diagonal block)*xx */
  PetscCall((*mpiaij->A->ops->multadd)(mpiaij->A, xx, yy, zz));
  PetscCall(VecScatterEnd(mpiaij->Mvctx, xx, mpiaij->lvec, INSERT_VALUES, SCATTER_FORWARD));
  /* zz += (off-diagonal block)*ghost values */
  PetscCall((*mpiaij->B->ops->multadd)(mpiaij->B, mpiaij->lvec, zz, zz));
  PetscFunctionReturn(0);
}
480fdc842d1SBarry Smith 
/* yy = A^T * xx for a MATMPIAIJCUSPARSE matrix.

   Both local transpose products are performed first; the off-diagonal
   result in the ghost vector is then scattered back (reverse mode) and
   added into yy. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  /* off-diagonal transpose product lands in the ghost work vector */
  PetscCall((*mpiaij->B->ops->multtranspose)(mpiaij->B, xx, mpiaij->lvec));
  PetscCall((*mpiaij->A->ops->multtranspose)(mpiaij->A, xx, yy));
  /* send the off-diagonal contributions back to their owners and add */
  PetscCall(VecScatterBegin(mpiaij->Mvctx, mpiaij->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(mpiaij->Mvctx, mpiaij->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(0);
}
4929ae82921SPaul Mullowney 
/* Sets the cuSPARSE storage format used for the diagonal block, the
   off-diagonal block, or both blocks of a MATMPIAIJCUSPARSE matrix.

   Supported operations are MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG,
   and MAT_CUSPARSE_ALL; any other op raises PETSC_ERR_SUP. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* Fixed error text: the previous message listed MAT_CUSPARSE_MULT_DIAG twice
       and the nonexistent MAT_CUSPARSE_MULT_ALL instead of the values actually
       handled by the cases above */
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}
515e057df02SPaul Mullowney 
/* Processes the -mat_cusparse_* options that choose the cuSPARSE storage
   format of the diagonal and off-diagonal blocks.  Factored matrices
   expose no such options. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                set;
  Mat_MPIAIJ              *mpiaij = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusp   = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    /* format for the diagonal block only */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusp->diagGPUMatFormat, (PetscEnum *)&format, &set));
    if (set) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    /* format for the off-diagonal block only */
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusp->offdiagGPUMatFormat, (PetscEnum *)&format, &set));
    if (set) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    /* format applied to both blocks at once */
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusp->diagGPUMatFormat, (PetscEnum *)&format, &set));
    if (set) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}
536e057df02SPaul Mullowney 
/* MatAssemblyEnd() implementation for MATMPIAIJCUSPARSE.

   Performs the standard MPIAIJ assembly, ensures the ghost work vector is
   a CUDA vector, and, when the nonzero state changed during assembly,
   frees the cached device-side split-CSR matrix so it is rebuilt on
   demand with the new index arrays. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr;
  PetscObjectState    onnz   = A->nonzerostate; /* state before assembly, to detect pattern changes */

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    /* the struct itself resides in device memory: copy it to a host
       staging copy so the device pointers it holds can be freed */
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) { /* index arrays are owned by the device struct */
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) { /* off-diagonal arrays may not exist */
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
56734d6c7a5SJose E. Roman 
/* MatDestroy() implementation for MATMPIAIJCUSPARSE.

   Frees the cached device-side split-CSR matrix (if any) and the COO
   assembly data, deletes the Mat_MPIAIJCUSPARSE container, removes the
   composed type-specific methods, and finishes with the generic MPIAIJ
   destructor. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Have device matrix\n"));
    /* the struct itself resides in device memory: copy it to a host
       staging copy so the device pointers it holds can be freed */
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) { /* index arrays are owned by the device struct */
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) { /* off-diagonal arrays may not exist */
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
  }
  /* Free COO */
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
  PetscCallCXX(delete cusparseStruct);
  /* detach the methods composed when the matrix was converted to this type */
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(0);
}
605ca45077fSPaul Mullowney 
/* Converts a MATMPIAIJ matrix B to MATMPIAIJCUSPARSE.

   Depending on reuse, *newmat is a fresh duplicate of B
   (MAT_INITIAL_MATRIX), an existing matrix receiving B's values
   (MAT_REUSE_MATRIX), or B itself (MAT_INPLACE_MATRIX).  The sequential
   blocks and the ghost work vector are retyped for CUDA, the
   CUSPARSE-specific operations are installed, and the type-specific
   methods removed later by MatDestroy_MPIAIJCUSPARSE() are composed. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat *newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  /* for MAT_INPLACE_MATRIX, *newmat already is B, so no copy is made */
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  /* vectors created for this matrix default to CUDA */
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  /* allocate the CUSPARSE container unless reusing a matrix that already has one */
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(0);
}
6479ae82921SPaul Mullowney 
/* Constructor for MATMPIAIJCUSPARSE: initializes the CUDA device, builds
   a plain MPIAIJ matrix, and converts it in place to the CUSPARSE type. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(0);
}
6563338378cSStefano Zampini 
6573f9c0db1SPaul Mullowney /*@
65811a5261eSBarry Smith    MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
66011a5261eSBarry Smith    to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
661e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
662e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
663e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
6649ae82921SPaul Mullowney 
665d083f849SBarry Smith    Collective
666e057df02SPaul Mullowney 
667e057df02SPaul Mullowney    Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or `PETSC_DECIDE` to have it calculated if `M` is given)
.  n - number of local columns (or `PETSC_DECIDE` to have it calculated if `N` is given)
.  M - number of global rows (or `PETSC_DETERMINE` to have it calculated if `m` is given)
.  N - number of global columns (or `PETSC_DETERMINE` to have it calculated if `n` is given)
.  d_nz - number of nonzeros per row in the diagonal portion of the matrix (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal portion of the matrix
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal portion of the matrix (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal portion of the matrix
           (possibly different for each row) or NULL
674e057df02SPaul Mullowney 
675e057df02SPaul Mullowney    Output Parameter:
676e057df02SPaul Mullowney .  A - the matrix
677e057df02SPaul Mullowney 
67811a5261eSBarry Smith    It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
679e057df02SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
68011a5261eSBarry Smith    [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
681e057df02SPaul Mullowney 
682e057df02SPaul Mullowney    Notes:
   If d_nnz (resp. o_nnz) is given then d_nz (resp. o_nz) is ignored
684e057df02SPaul Mullowney 
68511a5261eSBarry Smith    The AIJ format, also called the
686e057df02SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
687e057df02SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
688e057df02SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
689e057df02SPaul Mullowney 
   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz = o_nz = `PETSC_DEFAULT` and d_nnz = o_nnz = NULL for PETSc to control dynamic memory
692e057df02SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
693e057df02SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
694e057df02SPaul Mullowney 
695e057df02SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
696e057df02SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
697e057df02SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
698e057df02SPaul Mullowney    reusing matrix information to achieve increased efficiency.
699e057df02SPaul Mullowney 
700e057df02SPaul Mullowney    Level: intermediate
701e057df02SPaul Mullowney 
70211a5261eSBarry Smith .seealso: `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATMPIAIJCUSPARSE`, `MATAIJCUSPARSE`
703e057df02SPaul Mullowney @*/
/* Creates an AIJCUSPARSE matrix: sequential on one rank, MPI otherwise,
   and applies the corresponding preallocation. */
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt nproc;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &nproc));
  if (nproc == 1) {
    /* uniprocessor communicator: sequential matrix, single preallocation */
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  } else {
    /* parallel communicator: diagonal/off-diagonal preallocation */
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  }
  PetscFunctionReturn(0);
}
7219ae82921SPaul Mullowney 
7223ca39a21SBarry Smith /*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.
724e057df02SPaul Mullowney 
72511a5261eSBarry Smith    A matrix type type whose data resides on NVIDIA GPUs. These matrices can be in either
7262692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
72711a5261eSBarry Smith    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
7289ae82921SPaul Mullowney 
72911a5261eSBarry Smith    This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
73011a5261eSBarry Smith    and `MATMPIAIJCUSPARSE` otherwise.  As a result, for single process communicators,
73111a5261eSBarry Smith    `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
7329ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
7339ae82921SPaul Mullowney    the above preallocation routines for simplicity.
7349ae82921SPaul Mullowney 
7359ae82921SPaul Mullowney    Options Database Keys:
73611a5261eSBarry Smith +  -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE` during a call to `MatSetFromOptions()`
7378468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
73811a5261eSBarry Smith .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
73911a5261eSBarry Smith -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
7409ae82921SPaul Mullowney 
7419ae82921SPaul Mullowney   Level: beginner
7429ae82921SPaul Mullowney 
743db781477SPatrick Sanan  .seealso: `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
7446bb69460SJunchao Zhang M*/
7456bb69460SJunchao Zhang 
7466bb69460SJunchao Zhang /*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.
7486bb69460SJunchao Zhang 
7496bb69460SJunchao Zhang   Level: beginner
7506bb69460SJunchao Zhang 
751db781477SPatrick Sanan  .seealso: `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
7529ae82921SPaul Mullowney M*/
7533fa6b06aSMark Adams 
754042217e8SBarry Smith // get GPU pointers to stripped down Mat. For both seq and MPI Mat.
755d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
756d71ae5a4SJacob Faibussowitsch {
757042217e8SBarry Smith   PetscSplitCSRDataStructure d_mat;
758042217e8SBarry Smith   PetscMPIInt                size;
759042217e8SBarry Smith   int                       *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL;
760042217e8SBarry Smith   PetscScalar               *aa = NULL, *ba = NULL;
76199551766SMark Adams   Mat_SeqAIJ                *jaca = NULL, *jacb = NULL;
762042217e8SBarry Smith   Mat_SeqAIJCUSPARSE        *cusparsestructA = NULL;
763042217e8SBarry Smith   CsrMatrix                 *matrixA = NULL, *matrixB = NULL;
7643fa6b06aSMark Adams 
7653fa6b06aSMark Adams   PetscFunctionBegin;
76628b400f6SJacob Faibussowitsch   PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need already assembled matrix");
767042217e8SBarry Smith   if (A->factortype != MAT_FACTOR_NONE) {
7683fa6b06aSMark Adams     *B = NULL;
7693fa6b06aSMark Adams     PetscFunctionReturn(0);
7703fa6b06aSMark Adams   }
7719566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
77299551766SMark Adams   // get jaca
7733fa6b06aSMark Adams   if (size == 1) {
774042217e8SBarry Smith     PetscBool isseqaij;
775042217e8SBarry Smith 
7769566063dSJacob Faibussowitsch     PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij));
777042217e8SBarry Smith     if (isseqaij) {
7783fa6b06aSMark Adams       jaca = (Mat_SeqAIJ *)A->data;
77928b400f6SJacob Faibussowitsch       PetscCheck(jaca->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
780042217e8SBarry Smith       cusparsestructA = (Mat_SeqAIJCUSPARSE *)A->spptr;
781042217e8SBarry Smith       d_mat           = cusparsestructA->deviceMat;
7829566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
7833fa6b06aSMark Adams     } else {
7843fa6b06aSMark Adams       Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
78528b400f6SJacob Faibussowitsch       PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
786042217e8SBarry Smith       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
7873fa6b06aSMark Adams       jaca                      = (Mat_SeqAIJ *)aij->A->data;
788042217e8SBarry Smith       cusparsestructA           = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
789042217e8SBarry Smith       d_mat                     = spptr->deviceMat;
7909566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
7913fa6b06aSMark Adams     }
792042217e8SBarry Smith     if (cusparsestructA->format == MAT_CUSPARSE_CSR) {
793042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
79428b400f6SJacob Faibussowitsch       PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
795042217e8SBarry Smith       matrixA = (CsrMatrix *)matstruct->mat;
796042217e8SBarry Smith       bi      = NULL;
797042217e8SBarry Smith       bj      = NULL;
798042217e8SBarry Smith       ba      = NULL;
799042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_CUSPARSE_CSR");
800042217e8SBarry Smith   } else {
801042217e8SBarry Smith     Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
80228b400f6SJacob Faibussowitsch     PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
803042217e8SBarry Smith     jaca                      = (Mat_SeqAIJ *)aij->A->data;
80499551766SMark Adams     jacb                      = (Mat_SeqAIJ *)aij->B->data;
805042217e8SBarry Smith     Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
8063fa6b06aSMark Adams 
80708401ef6SPierre Jolivet     PetscCheck(A->nooffprocentries || aij->donotstash, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
808042217e8SBarry Smith     cusparsestructA                     = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
809042217e8SBarry Smith     Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE *)aij->B->spptr;
81008401ef6SPierre Jolivet     PetscCheck(cusparsestructA->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat A needs MAT_CUSPARSE_CSR");
811042217e8SBarry Smith     if (cusparsestructB->format == MAT_CUSPARSE_CSR) {
8129566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
8139566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B));
814042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
815042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructB->mat;
81628b400f6SJacob Faibussowitsch       PetscCheck(matstructA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
81728b400f6SJacob Faibussowitsch       PetscCheck(matstructB, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for B");
818042217e8SBarry Smith       matrixA = (CsrMatrix *)matstructA->mat;
819042217e8SBarry Smith       matrixB = (CsrMatrix *)matstructB->mat;
8203fa6b06aSMark Adams       if (jacb->compressedrow.use) {
821042217e8SBarry Smith         if (!cusparsestructB->rowoffsets_gpu) {
822042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
823042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1);
8243fa6b06aSMark Adams         }
825042217e8SBarry Smith         bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
826042217e8SBarry Smith       } else {
827042217e8SBarry Smith         bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
828042217e8SBarry Smith       }
829042217e8SBarry Smith       bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
830042217e8SBarry Smith       ba = thrust::raw_pointer_cast(matrixB->values->data());
831042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_CUSPARSE_CSR");
832042217e8SBarry Smith     d_mat = spptr->deviceMat;
833042217e8SBarry Smith   }
8343fa6b06aSMark Adams   if (jaca->compressedrow.use) {
835042217e8SBarry Smith     if (!cusparsestructA->rowoffsets_gpu) {
836042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
837042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1);
8383fa6b06aSMark Adams     }
839042217e8SBarry Smith     ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
8403fa6b06aSMark Adams   } else {
841042217e8SBarry Smith     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
8423fa6b06aSMark Adams   }
843042217e8SBarry Smith   aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
844042217e8SBarry Smith   aa = thrust::raw_pointer_cast(matrixA->values->data());
845042217e8SBarry Smith 
846042217e8SBarry Smith   if (!d_mat) {
847042217e8SBarry Smith     PetscSplitCSRDataStructure h_mat;
848042217e8SBarry Smith 
849042217e8SBarry Smith     // create and populate the struct on the host, then copy it to the device
8509566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Create device matrix\n"));
8519566063dSJacob Faibussowitsch     PetscCall(PetscNew(&h_mat));
8529566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&d_mat, sizeof(*d_mat)));
853042217e8SBarry Smith     if (size > 1) { /* need the colmap array */
854042217e8SBarry Smith       Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
85599551766SMark Adams       PetscInt   *colmap;
856042217e8SBarry Smith       PetscInt    ii, n = aij->B->cmap->n, N = A->cmap->N;
857042217e8SBarry Smith 
85808401ef6SPierre Jolivet       PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");
859042217e8SBarry Smith 
8609566063dSJacob Faibussowitsch       PetscCall(PetscCalloc1(N + 1, &colmap));
861042217e8SBarry Smith       for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = (int)(ii + 1);
862365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
863365b711fSMark Adams       { // have to make a long version of these
86499551766SMark Adams         int      *h_bi32, *h_bj32;
86599551766SMark Adams         PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
8669566063dSJacob Faibussowitsch         PetscCall(PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64));
8679566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), cudaMemcpyDeviceToHost));
86899551766SMark Adams         for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i];
8699566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), cudaMemcpyDeviceToHost));
87099551766SMark Adams         for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i];
87199551766SMark Adams 
8729566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64)));
8739566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), cudaMemcpyHostToDevice));
8749566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64)));
8759566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), cudaMemcpyHostToDevice));
87699551766SMark Adams 
87799551766SMark Adams         h_mat->offdiag.i         = d_bi64;
87899551766SMark Adams         h_mat->offdiag.j         = d_bj64;
87999551766SMark Adams         h_mat->allocated_indices = PETSC_TRUE;
88099551766SMark Adams 
8819566063dSJacob Faibussowitsch         PetscCall(PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64));
882365b711fSMark Adams       }
883365b711fSMark Adams #else
88499551766SMark Adams       h_mat->offdiag.i         = (PetscInt *)bi;
88599551766SMark Adams       h_mat->offdiag.j         = (PetscInt *)bj;
88699551766SMark Adams       h_mat->allocated_indices = PETSC_FALSE;
887365b711fSMark Adams #endif
888042217e8SBarry Smith       h_mat->offdiag.a = ba;
889042217e8SBarry Smith       h_mat->offdiag.n = A->rmap->n;
890042217e8SBarry Smith 
8919566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap)));
8929566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), cudaMemcpyHostToDevice));
8939566063dSJacob Faibussowitsch       PetscCall(PetscFree(colmap));
894042217e8SBarry Smith     }
895042217e8SBarry Smith     h_mat->rstart = A->rmap->rstart;
896042217e8SBarry Smith     h_mat->rend   = A->rmap->rend;
897042217e8SBarry Smith     h_mat->cstart = A->cmap->rstart;
898042217e8SBarry Smith     h_mat->cend   = A->cmap->rend;
89949b994a9SMark Adams     h_mat->M      = A->cmap->N;
900365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
901365b711fSMark Adams     {
90299551766SMark Adams       int      *h_ai32, *h_aj32;
90399551766SMark Adams       PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;
90463a3b9bcSJacob Faibussowitsch 
90563a3b9bcSJacob Faibussowitsch       static_assert(sizeof(PetscInt) == 8, "");
9069566063dSJacob Faibussowitsch       PetscCall(PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64));
9079566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), cudaMemcpyDeviceToHost));
90899551766SMark Adams       for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i];
9099566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), cudaMemcpyDeviceToHost));
91099551766SMark Adams       for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i];
91199551766SMark Adams 
9129566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64)));
9139566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), cudaMemcpyHostToDevice));
9149566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64)));
9159566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), cudaMemcpyHostToDevice));
91699551766SMark Adams 
91799551766SMark Adams       h_mat->diag.i            = d_ai64;
91899551766SMark Adams       h_mat->diag.j            = d_aj64;
91999551766SMark Adams       h_mat->allocated_indices = PETSC_TRUE;
92099551766SMark Adams 
9219566063dSJacob Faibussowitsch       PetscCall(PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64));
922365b711fSMark Adams     }
923365b711fSMark Adams #else
92499551766SMark Adams     h_mat->diag.i            = (PetscInt *)ai;
92599551766SMark Adams     h_mat->diag.j            = (PetscInt *)aj;
92699551766SMark Adams     h_mat->allocated_indices = PETSC_FALSE;
927365b711fSMark Adams #endif
928042217e8SBarry Smith     h_mat->diag.a = aa;
929042217e8SBarry Smith     h_mat->diag.n = A->rmap->n;
930042217e8SBarry Smith     h_mat->rank   = PetscGlobalRank;
931042217e8SBarry Smith     // copy pointers and metadata to device
9329566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_mat, h_mat, sizeof(*d_mat), cudaMemcpyHostToDevice));
9339566063dSJacob Faibussowitsch     PetscCall(PetscFree(h_mat));
934042217e8SBarry Smith   } else {
9359566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Reusing device matrix\n"));
936042217e8SBarry Smith   }
937042217e8SBarry Smith   *B             = d_mat;
938042217e8SBarry Smith   A->offloadmask = PETSC_OFFLOAD_GPU;
9393fa6b06aSMark Adams   PetscFunctionReturn(0);
9403fa6b06aSMark Adams }
941