#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(0);
  if (cusparseStruct->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
    PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(0);
}
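
/* Value insertion for the "basic" COO path. MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic()
   below leaves behind a device permutation coo_p that groups all diagonal-block entries
   ahead of all off-diagonal-block entries. When that permutation exists, the user values v
   are gathered through coo_p into the work array coo_pw (uploading v first if it lives on
   the host), and the two contiguous halves of coo_pw are handed to the sequential COO
   kernels of a->A and a->B; otherwise v already arrived partitioned and is passed through. */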
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)a->spptr;
  PetscInt            n    = cusp->coo_nd + cusp->coo_no;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                          *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v, v + n);
      PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    PetscCall(PetscLogGpuTimeEnd());
    delete w;
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, v, imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, v ? v + cusp->coo_nd : NULL, imode));
  }
  PetscFunctionReturn(0);
}

template <typename Tuple>
struct IsNotOffDiagT {
  PetscInt _cstart, _cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(Tuple t) { return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); }
};

struct IsOffDiag {
  PetscInt _cstart, _cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(const PetscInt &c) { return c < _cstart || c >= _cend; }
};

struct GlobToLoc {
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) { }
  __host__ __device__ inline PetscInt operator()(const PetscInt &c) { return c - _start; }
};
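
/* A small worked example of the partition performed below: with column ownership range
   [cstart, cend) = [4, 8) on this rank, IsOffDiag flags global columns 9 and 1 while
   columns 4 and 7 belong to the diagonal block, so the COO input
       i = {0, 0, 1, 1},  j = {4, 9, 1, 7}
   is split with the two diagonal entries (j = 4, 7) ahead of the two off-diagonal ones
   (the order inside each group is whatever thrust::partition produces). coo_p is carried
   through the same partition, so it records each entry's original position, which is what
   lets MatSetValuesCOO permute user values identically later. GlobToLoc then shifts row
   indices by rmap->rstart and diagonal-block column indices by cmap->rstart. */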
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ            *b    = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE    *cusp = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt               N, *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  d_i.assign(coo_i, coo_i + n);
  d_j.assign(coo_j, coo_j + n);
  PetscCall(PetscLogGpuTimeBegin());
  auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
    firstoffd  = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd);
  cusp->coo_no = thrust::distance(firstoffd, d_j.end());

  /* from global to local */
  thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  PetscCall(PetscMalloc1(cusp->coo_no, &jj)); /* jj[] will store compacted col ids of the offdiag part */
  PetscCallCUDA(cudaMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device, o_j, d_j.end());
  auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j, wit);
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff, &b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt)));
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj));
  PetscCheck(N == cusp->coo_no, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected IS size %" PetscInt_FMT " != %" PetscInt_FMT " coo size", N, cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
  PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
  PetscCall(MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));

  /* GPU memory, cusparse specific call handles it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj));
  PetscCall(PetscFree(jj));

  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat));

  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(0);
}
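
/* Preallocation entry point: choose between the basic path above and the extended COO
   machinery shared with MATMPIAIJ. Host-resident index arrays are scanned for negative
   (i.e. ignored) indices and for entries destined to rows owned by another rank, neither
   of which the basic path supports; the allreduce makes the decision collective so every
   rank takes the same branch. On the extended path the host-built jmap/perm/imap arrays
   are simply mirrored into device memory below. */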
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart, rend;

  PetscFunctionBegin;
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev                   = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar)));

    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(0);
}

__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
}
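
/* Data layout assumed by the kernels below (built by MatSetPreallocationCOO_MPIAIJ):
   Ajmap1 is a CSR-like offset array over the Annz diagonal-block nonzeros that receive
   local contributions, and Aperm1[Ajmap1[i]..Ajmap1[i+1]) lists the positions in the
   user's COO value array that fold into nonzero i. For example, if nonzero 0 was supplied
   twice and nonzero 1 once, then Ajmap1 = {0, 2, 3}, so kv[Aperm1[0]] and kv[Aperm1[1]]
   are summed into Aa[0] and kv[Aperm1[2]] into Aa[1]. The B* arrays play the same role
   for the off-diagonal block; the *2 arrays describe contributions received from other
   ranks, with Aimap2/Bimap2 giving the destination nonzero since remote contributions
   need not touch every nonzero. */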
__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      i -= Annz;
      for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
      Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
    }
  }
}

__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      i -= Annz2;
      for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
    }
  }
}
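
/* Extended COO value insertion. The flow is: (1) pack values destined to remote rows into
   sendbuf_d with MatPackCOOValues, (2) start the PetscSF reduction that ships them to
   their owners, (3) while that communication is in flight, fold the locally supplied
   values into A and B with MatAddLocalCOOValues, and (4) after PetscSFReduceEnd(), fold
   the received values in with MatAddRemoteCOOValues. Received values are always added;
   INSERT_VALUES semantics are realized by having the local kernel overwrite rather than
   accumulate. */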
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
  Mat                 A = mpiaij->A, B = mpiaij->B;
  PetscCount          Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa, *Ba = NULL;
  PetscScalar        *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1 = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size));
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on the host, copy it to the device */
      PetscCallCUDA(cudaMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen + 255) / 256, 256>>>(v1, mpiaij->sendlen, Cperm1, vsend);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz + Bnnz > 0) {
      MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    PetscCall(PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

    /* Add received remote entries to A and B */
    if (Annz2 + Bnnz2 > 0) {
      MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
    }
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat, v, imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
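
/* Merge the diagonal and off-diagonal blocks into a single sequential device matrix.
   When requested, glob maps each local column of the merged matrix to its global index:
   first the dn owned columns cst, cst + 1, ..., then the on ghost columns taken from the
   compacted cmap (garray) of the off-diagonal block. */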
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
{
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    for (i = 0; i < dn; i++) gidx[i] = cst + i;
    for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
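
/* y = A x with communication/computation overlap: start the forward scatter that gathers
   the ghost entries of xx into lvec, run the diagonal-block product while the scatter is
   in flight, then add the off-diagonal contribution yy += B lvec once it completes. */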
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(0);
}
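
/* For the transpose product both local products can run before any communication:
   yy = A^T xx is computed directly, B^T xx lands in lvec, and a reverse-mode scatter
   then adds each lvec entry into the owning rank's yy. */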
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr;
  PetscObjectState    onnz   = A->nonzerostate;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    PetscCall(PetscInfo(A, "Have device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
    }
    PetscCallCUDA(cudaFree(d_mat));
    PetscCall(PetscFree(h_mat));
  }
  /* Free COO */
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(0);
}
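
/* Conversion from MATMPIAIJ: duplicate or copy the host matrix as requested, retype the
   sequential blocks and the ghost vector to their CUDA counterparts, make VECCUDA the
   default vector type, and install the CUSPARSE operations and composed functions. With
   MAT_INPLACE_MATRIX, B == *newmat on entry, so nothing is duplicated or copied. */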
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat *newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(0);
}

/*@
  MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
  (the default parallel PETSc format). This matrix will ultimately be pushed down
  to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix
  assembly performance the user should preallocate the matrix storage by setting
  the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).
  By setting these parameters accurately,
  performance during matrix assembly can be increased by more than a factor of 50.

  Collective

  Input Parameters:
+ comm  - MPI communicator, set to `PETSC_COMM_SELF`
. m     - number of local rows
. n     - number of local columns
. M     - number of global rows
. N     - number of global columns
. d_nz  - number of nonzeros per row in the diagonal block (same for all local rows)
. d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
          (possibly different for each row) or NULL
. o_nz  - number of nonzeros per row in the off-diagonal block (same for all local rows)
- o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
          (possibly different for each row) or NULL

  Output Parameter:
. A - the matrix

  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  Notes:
  If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

  The AIJ format (also called compressed row storage) is fully compatible with standard Fortran 77
  storage. That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero. See the users' manual for details.

  Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
  Set d_nz = o_nz = `PETSC_DEFAULT` and d_nnz = o_nnz = NULL for PETSc to control dynamic memory
  allocation. For large problems you MUST preallocate memory or you
  will get TERRIBLE performance, see the users' manual chapter on matrices.

  By default, this format uses inodes (identical nodes) when possible, to
  improve numerical efficiency of matrix-vector products and solves. We
  search for consecutive rows with the same nonzero structure, thereby
  reusing matrix information to achieve increased efficiency.
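
  Example Usage:
  A minimal sketch of the recommended creation paradigm; `comm`, `m`, and `n` stand in for
  the caller's communicator and local sizes, and the scalar estimates (5 and 2) are
  illustrative only:
.vb
  Mat A;

  PetscCall(MatCreate(comm, &A));
  PetscCall(MatSetSizes(A, m, n, PETSC_DETERMINE, PETSC_DETERMINE));
  PetscCall(MatSetType(A, MATAIJCUSPARSE));
  PetscCall(MatSeqAIJSetPreallocation(A, 5, NULL));          /* used on one process */
  PetscCall(MatMPIAIJSetPreallocation(A, 5, NULL, 2, NULL)); /* used on several processes */
.ve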

  Level: intermediate

.seealso: `MATAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  if (size > 1) {
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  } else {
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  }
  PetscFunctionReturn(0);
}

/*MC
  MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.

  A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
  CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
  All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

  This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
  and `MATMPIAIJCUSPARSE` otherwise. As a result, for single process communicators,
  `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
  for communicators controlling multiple processes. It is recommended that you call both of
  the above preallocation routines for simplicity.

  Options Database Keys:
+ -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE` during a call to `MatSetFromOptions()`
. -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
. -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
- -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid).
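
  Example Usage:
  A command-line sketch; the executable name is a placeholder for any PETSc program that
  calls `MatSetFromOptions()` on its matrix:
.vb
  ./app -mat_type aijcusparse -mat_cusparse_storage_format ell
.ve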

  Level: beginner

.seealso: `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/*MC
  MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.

  Level: beginner

.seealso: `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
M*/

// get GPU pointers to stripped down Mat. For both seq and MPI Mat.
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure  d_mat;
  PetscMPIInt                 size;
  int                        *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL;
  PetscScalar                *aa = NULL, *ba = NULL;
  Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL, *matrixB = NULL;

  PetscFunctionBegin;
  PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size));
  // get jaca
  if (size == 1) {
    PetscBool isseqaij;

    PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij));
    if (isseqaij) {
      jaca = (Mat_SeqAIJ *)A->data;
      PetscCheck(jaca->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE *)A->spptr;
      d_mat           = cusparsestructA->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
      PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
      jaca            = (Mat_SeqAIJ *)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
      d_mat           = spptr->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
    }
    if (cusparsestructA->format == MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
795042217e8SBarry Smith       matrixA = (CsrMatrix *)matstruct->mat;
796042217e8SBarry Smith       bi      = NULL;
797042217e8SBarry Smith       bj      = NULL;
798042217e8SBarry Smith       ba      = NULL;
799042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_CUSPARSE_CSR");
800042217e8SBarry Smith   } else {
801042217e8SBarry Smith     Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
80228b400f6SJacob Faibussowitsch     PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column-oriented value insertion");
803042217e8SBarry Smith     jaca = (Mat_SeqAIJ *)aij->A->data;
80499551766SMark Adams     jacb = (Mat_SeqAIJ *)aij->B->data;
805042217e8SBarry Smith     Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr;
8063fa6b06aSMark Adams 
80708401ef6SPierre Jolivet     PetscCheck(A->nooffprocentries || aij->donotstash, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support off-process value insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
808042217e8SBarry Smith     cusparsestructA = (Mat_SeqAIJCUSPARSE *)aij->A->spptr;
809042217e8SBarry Smith     Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE *)aij->B->spptr;
81008401ef6SPierre Jolivet     PetscCheck(cusparsestructA->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat A needs MAT_CUSPARSE_CSR");
811042217e8SBarry Smith     if (cusparsestructB->format == MAT_CUSPARSE_CSR) {
8129566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
8139566063dSJacob Faibussowitsch       PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B));
814042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat;
815042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructB->mat;
81628b400f6SJacob Faibussowitsch       PetscCheck(matstructA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A");
81728b400f6SJacob Faibussowitsch       PetscCheck(matstructB, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for B");
818042217e8SBarry Smith       matrixA = (CsrMatrix *)matstructA->mat;
819042217e8SBarry Smith       matrixB = (CsrMatrix *)matstructB->mat;
8203fa6b06aSMark Adams       if (jacb->compressedrow.use) {
821042217e8SBarry Smith         if (!cusparsestructB->rowoffsets_gpu) {
822042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
823042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1);
8243fa6b06aSMark Adams         }
825042217e8SBarry Smith         bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
826042217e8SBarry Smith       } else {
827042217e8SBarry Smith         bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
828042217e8SBarry Smith       }
829042217e8SBarry Smith       bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
830042217e8SBarry Smith       ba = thrust::raw_pointer_cast(matrixB->values->data());
831042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_CUSPARSE_CSR");
832042217e8SBarry Smith     d_mat = spptr->deviceMat;
833042217e8SBarry Smith   }
8343fa6b06aSMark Adams   if (jaca->compressedrow.use) {
835042217e8SBarry Smith     if (!cusparsestructA->rowoffsets_gpu) {
836042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1);
837042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1);
8383fa6b06aSMark Adams     }
839042217e8SBarry Smith     ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
8403fa6b06aSMark Adams   } else {
841042217e8SBarry Smith     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
8423fa6b06aSMark Adams   }
843042217e8SBarry Smith   aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
844042217e8SBarry Smith   aa = thrust::raw_pointer_cast(matrixA->values->data());
845042217e8SBarry Smith 
846042217e8SBarry Smith   if (!d_mat) {
847042217e8SBarry Smith     PetscSplitCSRDataStructure h_mat;
848042217e8SBarry Smith 
849042217e8SBarry Smith     // create and populate the struct on the host, then copy it to the device
8509566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Create device matrix\n"));
8519566063dSJacob Faibussowitsch     PetscCall(PetscNew(&h_mat));
8529566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMalloc((void **)&d_mat, sizeof(*d_mat)));
853042217e8SBarry Smith     if (size > 1) { /* need the colmap array */
854042217e8SBarry Smith       Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
85599551766SMark Adams       PetscInt   *colmap;
856042217e8SBarry Smith       PetscInt    ii, n = aij->B->cmap->n, N = A->cmap->N;
857042217e8SBarry Smith 
85808401ef6SPierre Jolivet       PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray");
859042217e8SBarry Smith 
8609566063dSJacob Faibussowitsch       PetscCall(PetscCalloc1(N + 1, &colmap));
861042217e8SBarry Smith       for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = (int)(ii + 1);
862365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
863365b711fSMark Adams       { // cuSPARSE stores 32-bit indices; make 64-bit (PetscInt) copies of them
86499551766SMark Adams         int      *h_bi32, *h_bj32;
86599551766SMark Adams         PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
8669566063dSJacob Faibussowitsch         PetscCall(PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64));
8679566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), cudaMemcpyDeviceToHost));
86899551766SMark Adams         for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i];
8699566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), cudaMemcpyDeviceToHost));
87099551766SMark Adams         for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i];
87199551766SMark Adams 
8729566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64)));
8739566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), cudaMemcpyHostToDevice));
8749566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64)));
8759566063dSJacob Faibussowitsch         PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), cudaMemcpyHostToDevice));
87699551766SMark Adams 
87799551766SMark Adams         h_mat->offdiag.i         = d_bi64;
87899551766SMark Adams         h_mat->offdiag.j         = d_bj64;
87999551766SMark Adams         h_mat->allocated_indices = PETSC_TRUE;
88099551766SMark Adams 
8819566063dSJacob Faibussowitsch         PetscCall(PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64));
882365b711fSMark Adams       }
883365b711fSMark Adams #else
88499551766SMark Adams       h_mat->offdiag.i         = (PetscInt *)bi;
88599551766SMark Adams       h_mat->offdiag.j         = (PetscInt *)bj;
88699551766SMark Adams       h_mat->allocated_indices = PETSC_FALSE;
887365b711fSMark Adams #endif
888042217e8SBarry Smith       h_mat->offdiag.a = ba;
889042217e8SBarry Smith       h_mat->offdiag.n = A->rmap->n;
890042217e8SBarry Smith 
8919566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap)));
8929566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), cudaMemcpyHostToDevice));
8939566063dSJacob Faibussowitsch       PetscCall(PetscFree(colmap));
894042217e8SBarry Smith     }
895042217e8SBarry Smith     h_mat->rstart = A->rmap->rstart;
896042217e8SBarry Smith     h_mat->rend   = A->rmap->rend;
897042217e8SBarry Smith     h_mat->cstart = A->cmap->rstart;
898042217e8SBarry Smith     h_mat->cend   = A->cmap->rend;
89949b994a9SMark Adams     h_mat->M      = A->cmap->N;
900365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
901365b711fSMark Adams     { // as above, make 64-bit (PetscInt) copies of the 32-bit cuSPARSE indices
90299551766SMark Adams       int      *h_ai32, *h_aj32;
90399551766SMark Adams       PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;
90463a3b9bcSJacob Faibussowitsch 
90563a3b9bcSJacob Faibussowitsch       static_assert(sizeof(PetscInt) == 8, "");
9069566063dSJacob Faibussowitsch       PetscCall(PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64));
9079566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), cudaMemcpyDeviceToHost));
90899551766SMark Adams       for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i];
9099566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), cudaMemcpyDeviceToHost));
91099551766SMark Adams       for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i];
91199551766SMark Adams 
9129566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64)));
9139566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), cudaMemcpyHostToDevice));
9149566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64)));
9159566063dSJacob Faibussowitsch       PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), cudaMemcpyHostToDevice));
91699551766SMark Adams 
91799551766SMark Adams       h_mat->diag.i            = d_ai64;
91899551766SMark Adams       h_mat->diag.j            = d_aj64;
91999551766SMark Adams       h_mat->allocated_indices = PETSC_TRUE;
92099551766SMark Adams 
9219566063dSJacob Faibussowitsch       PetscCall(PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64));
922365b711fSMark Adams     }
923365b711fSMark Adams #else
92499551766SMark Adams     h_mat->diag.i            = (PetscInt *)ai;
92599551766SMark Adams     h_mat->diag.j            = (PetscInt *)aj;
92699551766SMark Adams     h_mat->allocated_indices = PETSC_FALSE;
927365b711fSMark Adams #endif
928042217e8SBarry Smith     h_mat->diag.a = aa;
929042217e8SBarry Smith     h_mat->diag.n = A->rmap->n;
930042217e8SBarry Smith     h_mat->rank   = PetscGlobalRank;
931042217e8SBarry Smith     // copy pointers and metadata to the device
9329566063dSJacob Faibussowitsch     PetscCallCUDA(cudaMemcpy(d_mat, h_mat, sizeof(*d_mat), cudaMemcpyHostToDevice));
9339566063dSJacob Faibussowitsch     PetscCall(PetscFree(h_mat));
934042217e8SBarry Smith   } else {
9359566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Reusing device matrix\n"));
936042217e8SBarry Smith   }
937042217e8SBarry Smith   *B             = d_mat;
938042217e8SBarry Smith   A->offloadmask = PETSC_OFFLOAD_GPU;
9393fa6b06aSMark Adams   PetscFunctionReturn(0);
9403fa6b06aSMark Adams }
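// ---------------------------------------------------------------------------
// Illustrative usage sketch (added for exposition; not part of the library
// source, and guarded out of compilation). It shows one way a caller might use
// MatCUSPARSEGetDeviceMatWrite() together with the device-side
// MatSetValuesDevice() declared in petscaijdevice.h to insert values from a
// CUDA kernel. The matrix A is assumed to be preallocated and assembled once
// before the call (the function above requires A->assembled); the kernel,
// block size, and helper names here are illustrative assumptions, not an API
// defined in this file.
#if 0
  #include <petscaijdevice.h>

__global__ static void AddDiagonal(PetscSplitCSRDataStructure d_mat, PetscInt rstart, PetscInt rend)
{
  PetscInt row = rstart + blockIdx.x * blockDim.x + threadIdx.x;

  if (row < rend) {
    PetscScalar    v = 1.0;
    PetscErrorCode ierr;

    // device-side insertion into the split CSR structure; local rows only
    ierr = MatSetValuesDevice(d_mat, 1, &row, 1, &row, &v, ADD_VALUES);
    if (ierr) return;
  }
}

static PetscErrorCode AddDiagonalOnDevice(Mat A)
{
  PetscSplitCSRDataStructure d_mat;
  PetscInt                   rstart, rend;

  PetscFunctionBegin;
  PetscCall(MatSetOption(A, MAT_NO_OFF_PROC_ENTRIES, PETSC_TRUE)); // device assembly supports local entries only
  PetscCall(MatGetOwnershipRange(A, &rstart, &rend));
  PetscCall(MatCUSPARSEGetDeviceMatWrite(A, &d_mat));
  AddDiagonal<<<(unsigned)((rend - rstart + 255) / 256), 256>>>(d_mat, rstart, rend);
  PetscCallCUDA(cudaDeviceSynchronize());
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
  PetscFunctionReturn(0);
}
#endif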