#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

/* Functor for thrust::for_each over a zip iterator: copies element 0 of the tuple into element 1 */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Release all device-side COO assembly state (extended-COO maps/permutations and the
   basic-COO permutation arrays) held in the Mat_MPIAIJCUSPARSE struct */
static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(PETSC_SUCCESS);
  if (cusparseStruct->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
    PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Basic-COO value insertion: if a device permutation (coo_p) was built at preallocation time,
   permute v[] into the work array coo_pw (diagonal entries first, off-diagonal after) and feed
   the two halves to the sequential diag (a->A) and off-diag (a->B) blocks; otherwise v[] is
   already in that order and is passed through directly.  v may be host or device memory. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)a->spptr;
  PetscInt            n    = cusp->coo_nd + cusp->coo_no;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                          *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* v is on the host: stage it into a temporary device array first */
      w = new THRUSTARRAY(n);
      w->assign(v, v + n);
      PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
      d_v = w->data();
    }

    /* coo_pw[i] = d_v[coo_p[i]] via a zip of (permuted input, output) */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    PetscCall(PetscLogGpuTimeEnd());
    delete w;
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, v, imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, v ? v + cusp->coo_nd : NULL, imode));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Predicate on (row, col, ...) tuples: true when the column lies inside [cstart, cend),
   i.e. the entry belongs to the diagonal block */
template <typename Tuple>
struct IsNotOffDiagT {
  PetscInt _cstart, _cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(Tuple t) { return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); }
};

/* Predicate on a column index: true when it lies outside [cstart, cend) (off-diagonal block) */
struct IsOffDiag {
  PetscInt _cstart, _cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(const PetscInt &c) { return c < _cstart || c >= _cend; }
};

/* Shift a global index by -start to obtain a local index */
struct GlobToLoc {
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) { }
  __host__ __device__ inline PetscInt operator()(const PetscInt &c) { return c - _start; }
};

/* Basic-COO preallocation: all (coo_i, coo_j) entries are locally owned.  Partitions the
   entries on device into diagonal-first/off-diagonal-last order (recording the permutation
   in cusp->coo_p when reordering is needed), maps indices from global to local, compacts the
   off-diagonal column indices into b->garray, and preallocates the sequential diag/off-diag
   CUSPARSE blocks b->A and b->B. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ            *b    = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE    *cusp = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt               N, *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  d_i.assign(coo_i, coo_i + n);
  d_j.assign(coo_j, coo_j + n);
  PetscCall(PetscLogGpuTimeBegin());
  auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* entries are interleaved: build a stable permutation that puts diagonal entries first */
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
    firstoffd  = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd);
  cusp->coo_no = thrust::distance(firstoffd, d_j.end());

  /* from global to local */
  thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  PetscCall(PetscMalloc1(cusp->coo_no, &jj)); /* jj[] will store compacted col ids of the offdiag part */
  PetscCallCUDA(cudaMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device, o_j, d_j.end());
  auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j, wit);
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff, &b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt)));
  /* compact jj[] (global off-diag columns) to local indices into garray via a hash g2l map */
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj));
  PetscCheck(N == cusp->coo_no, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size", N, cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
  PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
  PetscCall(MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));

  /* GPU memory, cusparse specific call handles it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj));
  PetscCall(PetscFree(jj));

  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat));

  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* COO preallocation dispatcher.  Uses the fast "basic" path when every rank's indices are
   non-negative and locally owned (checked on host memory only; device-resident indices are
   assumed non-basic since mtype defaults to PETSC_MEMTYPE_DEVICE); otherwise falls back to
   the general MPIAIJ extended-COO path and mirrors its host-side maps onto the device. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart, rend;

  PetscFunctionBegin;
  /* wipe any previous assembly/scatter state before re-preallocating */
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev                   = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    /* device copies of the extended-COO maps built by MatSetPreallocationCOO_MPIAIJ */
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar)));

    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Gather kernel: buf[i] = kv[perm[i]] for i in [0, nnz).  Grid-stride loop; launched with
   enough blocks to cover nnz in one pass. */
__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
}

/* Accumulate locally-contributed COO values into the diag (Aa) and off-diag (Ba) value
   arrays.  jmap1[i]..jmap1[i+1] delimits, in perm1[], the input positions summed into
   nonzero i.  With INSERT_VALUES the previous value is discarded, otherwise added to.
   NOTE(review): the `i -= Annz/Bnnz` rebasing inside the loop assumes the launch grid covers
   Annz+Bnnz in a single pass (as the <<<(Annz+Bnnz+255)/256, 256>>> launch guarantees); a
   multi-pass grid-stride launch would resume from the rebased index. */
__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      i -= Annz;
      for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
      Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
    }
  }
}

/* Accumulate values received from remote ranks into Aa/Ba.  imap2[i] is the target nonzero;
   jmap2/perm2 delimit the received positions summed into it.  Always additive (remote
   contributions are added after the local pass).  Same single-pass launch assumption as
   MatAddLocalCOOValues. */
__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      i -= Annz2;
      for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
    }
  }
}

/* Extended-COO value insertion: stage v[] on device if needed, pack entries destined for
   remote ranks, start the PetscSF reduction, overlap it with the local accumulation kernel,
   then fold in the received remote contributions.  Falls back to the basic path when the
   extended-COO maps were not built. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
  Mat                 A = mpiaij->A, B = mpiaij->B;
  PetscCount          Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa, *Ba = NULL;
  PetscScalar        *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1 = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size));
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      PetscCallCUDA(cudaMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen + 255) / 256, 256>>>(v1, mpiaij->sendlen, Cperm1, vsend);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz + Bnnz > 0) {
      MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    PetscCall(PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

    /* Add received remote entries to A and B */
    if (Annz2 + Bnnz2 > 0) {
      MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
    }
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat, v, imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Merge the diag and off-diag sequential blocks into one local matrix; optionally return the
   global column indices (owned columns first, then the ghost columns from cmap). */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
{
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    for (i = 0; i < dn; i++) gidx[i] = cst + i;
    for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Classic (nnz-per-row) preallocation: validate the nnz arrays in debug builds, discard stale
   scatter/colmap state, (re)create the sequential CUSPARSE diag/off-diag blocks, and forward
   the preallocation to them. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    /* off-diag block is empty in the sequential (size == 1) case */
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* y = A x: overlap the ghost-value scatter with the diag-block multiply, then add the
   off-diag contribution from the gathered ghost values. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Zero both sequential blocks */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* z = y + A x, with the same scatter/compute overlap as MatMult */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy) 481d71ae5a4SJacob Faibussowitsch { 482ca45077fSPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 483ca45077fSPaul Mullowney 484ca45077fSPaul Mullowney PetscFunctionBegin; 4859566063dSJacob Faibussowitsch PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec)); 4869566063dSJacob Faibussowitsch PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy)); 4879566063dSJacob Faibussowitsch PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE)); 4889566063dSJacob Faibussowitsch PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE)); 4893ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 490ca45077fSPaul Mullowney } 4919ae82921SPaul Mullowney 492d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) 493d71ae5a4SJacob Faibussowitsch { 494ca45077fSPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 495bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr; 496e057df02SPaul Mullowney 497ca45077fSPaul Mullowney PetscFunctionBegin; 498e057df02SPaul Mullowney switch (op) { 499d71ae5a4SJacob Faibussowitsch case MAT_CUSPARSE_MULT_DIAG: 500d71ae5a4SJacob Faibussowitsch cusparseStruct->diagGPUMatFormat = format; 501d71ae5a4SJacob Faibussowitsch break; 502d71ae5a4SJacob Faibussowitsch case MAT_CUSPARSE_MULT_OFFDIAG: 503d71ae5a4SJacob Faibussowitsch cusparseStruct->offdiagGPUMatFormat = format; 504d71ae5a4SJacob Faibussowitsch break; 505e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 506e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = format; 507e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = format; 508045c96e1SPaul Mullowney break; 509d71ae5a4SJacob Faibussowitsch default: 510d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for 
MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.", op); 511045c96e1SPaul Mullowney } 5123ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 513ca45077fSPaul Mullowney } 514e057df02SPaul Mullowney 515d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject) 516d71ae5a4SJacob Faibussowitsch { 517e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 518e057df02SPaul Mullowney PetscBool flg; 519a183c035SDominic Meiser Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 520a183c035SDominic Meiser Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr; 5215fd66863SKarl Rupp 522e057df02SPaul Mullowney PetscFunctionBegin; 523d0609cedSBarry Smith PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options"); 524e057df02SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 5259371c9d4SSatish Balay PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg)); 5269566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format)); 5279371c9d4SSatish Balay PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg)); 5289566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format)); 5299371c9d4SSatish Balay PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 
"MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg)); 5309566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format)); 531e057df02SPaul Mullowney } 532d0609cedSBarry Smith PetscOptionsHeadEnd(); 5333ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 534e057df02SPaul Mullowney } 535e057df02SPaul Mullowney 536d71ae5a4SJacob Faibussowitsch PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode) 537d71ae5a4SJacob Faibussowitsch { 5383fa6b06aSMark Adams Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data; 539042217e8SBarry Smith Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr; 540042217e8SBarry Smith PetscObjectState onnz = A->nonzerostate; 541042217e8SBarry Smith 54234d6c7a5SJose E. Roman PetscFunctionBegin; 5439566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd_MPIAIJ(A, mode)); 5449566063dSJacob Faibussowitsch if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA)); 545042217e8SBarry Smith if (onnz != A->nonzerostate && cusp->deviceMat) { 546042217e8SBarry Smith PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat; 547042217e8SBarry Smith 5489566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n")); 5499566063dSJacob Faibussowitsch PetscCall(PetscNew(&h_mat)); 5509566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost)); 5519566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->colmap)); 55299551766SMark Adams if (h_mat->allocated_indices) { 5539566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->diag.i)); 5549566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->diag.j)); 55599551766SMark Adams if (h_mat->offdiag.j) { 5569566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->offdiag.i)); 5579566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->offdiag.j)); 55899551766SMark 
Adams } 55999551766SMark Adams } 5609566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(d_mat)); 5619566063dSJacob Faibussowitsch PetscCall(PetscFree(h_mat)); 562042217e8SBarry Smith cusp->deviceMat = NULL; 5633fa6b06aSMark Adams } 5643ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 56534d6c7a5SJose E. Roman } 56634d6c7a5SJose E. Roman 567d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 568d71ae5a4SJacob Faibussowitsch { 5693fa6b06aSMark Adams Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data; 5703fa6b06aSMark Adams Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr; 571bbf3fe20SPaul Mullowney 572bbf3fe20SPaul Mullowney PetscFunctionBegin; 57328b400f6SJacob Faibussowitsch PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr"); 5743fa6b06aSMark Adams if (cusparseStruct->deviceMat) { 575042217e8SBarry Smith PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat; 576042217e8SBarry Smith 5779566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Have device matrix\n")); 5789566063dSJacob Faibussowitsch PetscCall(PetscNew(&h_mat)); 5799566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost)); 5809566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->colmap)); 58199551766SMark Adams if (h_mat->allocated_indices) { 5829566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->diag.i)); 5839566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->diag.j)); 58499551766SMark Adams if (h_mat->offdiag.j) { 5859566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->offdiag.i)); 5869566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->offdiag.j)); 58799551766SMark Adams } 58899551766SMark Adams } 5899566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(d_mat)); 5909566063dSJacob Faibussowitsch PetscCall(PetscFree(h_mat)); 5913fa6b06aSMark Adams } 592cbc6b225SStefano Zampini /* Free COO */ 5939566063dSJacob Faibussowitsch 
PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A)); 594acbf8a88SJunchao Zhang PetscCallCXX(delete cusparseStruct); 5959566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL)); 5969566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL)); 5979566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 5989566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 5999566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL)); 6009566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL)); 6019566063dSJacob Faibussowitsch PetscCall(MatDestroy_MPIAIJ(A)); 6023ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 603bbf3fe20SPaul Mullowney } 604ca45077fSPaul Mullowney 605*26cec326SBarry Smith /* defines MatSetValues_MPICUSPARSE_Hash() */ 606*26cec326SBarry Smith #define TYPE AIJ 607*26cec326SBarry Smith #define TYPE_AIJ 608*26cec326SBarry Smith #define SUB_TYPE_CUSPARSE 609*26cec326SBarry Smith #include "../src/mat/impls/aij/mpi/mpihashmat.h" 610*26cec326SBarry Smith #undef TYPE 611*26cec326SBarry Smith #undef TYPE_AIJ 612*26cec326SBarry Smith #undef SUB_TYPE_CUSPARSE 613*26cec326SBarry Smith 614*26cec326SBarry Smith static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A) 615*26cec326SBarry Smith { 616*26cec326SBarry Smith Mat_MPIAIJ *b = (Mat_MPIAIJ *)A->data; 617*26cec326SBarry Smith Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr; 618*26cec326SBarry Smith 619*26cec326SBarry Smith PetscFunctionBegin; 620*26cec326SBarry Smith PetscCall(MatSetUp_MPI_Hash(A)); 621*26cec326SBarry Smith PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat)); 
622*26cec326SBarry Smith PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat)); 623*26cec326SBarry Smith A->preallocated = PETSC_TRUE; 624*26cec326SBarry Smith PetscFunctionReturn(PETSC_SUCCESS); 625*26cec326SBarry Smith } 626*26cec326SBarry Smith 6278eb1d50fSPierre Jolivet PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat) 628d71ae5a4SJacob Faibussowitsch { 629bbf3fe20SPaul Mullowney Mat_MPIAIJ *a; 6303338378cSStefano Zampini Mat A; 6319ae82921SPaul Mullowney 6329ae82921SPaul Mullowney PetscFunctionBegin; 6339566063dSJacob Faibussowitsch PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 6349566063dSJacob Faibussowitsch if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat)); 6359566063dSJacob Faibussowitsch else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN)); 6363338378cSStefano Zampini A = *newmat; 6376f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 6389566063dSJacob Faibussowitsch PetscCall(PetscFree(A->defaultvectype)); 6399566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype)); 64034136279SStefano Zampini 641bbf3fe20SPaul Mullowney a = (Mat_MPIAIJ *)A->data; 6429566063dSJacob Faibussowitsch if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE)); 6439566063dSJacob Faibussowitsch if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE)); 6449566063dSJacob Faibussowitsch if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA)); 645d98d7c49SStefano Zampini 64648a46eb9SPierre Jolivet if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE); 6472205254eSKarl Rupp 64834d6c7a5SJose E. 
Roman A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 649bbf3fe20SPaul Mullowney A->ops->mult = MatMult_MPIAIJCUSPARSE; 650fdc842d1SBarry Smith A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 651bbf3fe20SPaul Mullowney A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 652bbf3fe20SPaul Mullowney A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 653bbf3fe20SPaul Mullowney A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 6543fa6b06aSMark Adams A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE; 6554e84afc0SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND; 656*26cec326SBarry Smith A->ops->setup = MatSetUp_MPI_HASH_CUSPARSE; 6572205254eSKarl Rupp 6589566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE)); 6599566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE)); 6609566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE)); 6619566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE)); 6629566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE)); 6639566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE)); 664ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE) 6659566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE)); 666ae48a8d0SStefano Zampini #endif 6673ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6689ae82921SPaul Mullowney } 6699ae82921SPaul Mullowney 670d71ae5a4SJacob Faibussowitsch PETSC_EXTERN 
PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 671d71ae5a4SJacob Faibussowitsch { 6723338378cSStefano Zampini PetscFunctionBegin; 6739566063dSJacob Faibussowitsch PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 6749566063dSJacob Faibussowitsch PetscCall(MatCreate_MPIAIJ(A)); 6759566063dSJacob Faibussowitsch PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A)); 6763ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6773338378cSStefano Zampini } 6783338378cSStefano Zampini 6793f9c0db1SPaul Mullowney /*@ 68011a5261eSBarry Smith MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format 681e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 68211a5261eSBarry Smith to NVIDIA GPUs and use the CuSPARSE library for calculations. For good matrix 683e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 684e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 685e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 6869ae82921SPaul Mullowney 687d083f849SBarry Smith Collective 688e057df02SPaul Mullowney 689e057df02SPaul Mullowney Input Parameters: 69011a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF` 691e057df02SPaul Mullowney . m - number of rows 692e057df02SPaul Mullowney . n - number of columns 693e057df02SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 694e057df02SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 6950298fd71SBarry Smith (possibly different for each row) or NULL 696e057df02SPaul Mullowney 697e057df02SPaul Mullowney Output Parameter: 698e057df02SPaul Mullowney . 
A - the matrix 699e057df02SPaul Mullowney 70011a5261eSBarry Smith It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 701e057df02SPaul Mullowney MatXXXXSetPreallocation() paradigm instead of this routine directly. 70211a5261eSBarry Smith [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 703e057df02SPaul Mullowney 704e057df02SPaul Mullowney Notes: 705e057df02SPaul Mullowney If nnz is given then nz is ignored 706e057df02SPaul Mullowney 70711a5261eSBarry Smith The AIJ format, also called the 708e057df02SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 709e057df02SPaul Mullowney storage. That is, the stored row and column indices can begin at 710e057df02SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 711e057df02SPaul Mullowney 712e057df02SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 71311a5261eSBarry Smith Set nz = `PETSC_DEFAULT` and nnz = NULL for PETSc to control dynamic memory 714e057df02SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 715e057df02SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 716e057df02SPaul Mullowney 717e057df02SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 718e057df02SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 719e057df02SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 720e057df02SPaul Mullowney reusing matrix information to achieve increased efficiency. 
721e057df02SPaul Mullowney 722e057df02SPaul Mullowney Level: intermediate 723e057df02SPaul Mullowney 72411a5261eSBarry Smith .seealso: `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATMPIAIJCUSPARSE`, `MATAIJCUSPARSE` 725e057df02SPaul Mullowney @*/ 726d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A) 727d71ae5a4SJacob Faibussowitsch { 7289ae82921SPaul Mullowney PetscMPIInt size; 7299ae82921SPaul Mullowney 7309ae82921SPaul Mullowney PetscFunctionBegin; 7319566063dSJacob Faibussowitsch PetscCall(MatCreate(comm, A)); 7329566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A, m, n, M, N)); 7339566063dSJacob Faibussowitsch PetscCallMPI(MPI_Comm_size(comm, &size)); 7349ae82921SPaul Mullowney if (size > 1) { 7359566063dSJacob Faibussowitsch PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE)); 7369566063dSJacob Faibussowitsch PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz)); 7379ae82921SPaul Mullowney } else { 7389566063dSJacob Faibussowitsch PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE)); 7399566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz)); 7409ae82921SPaul Mullowney } 7413ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 7429ae82921SPaul Mullowney } 7439ae82921SPaul Mullowney 7443ca39a21SBarry Smith /*MC 74511a5261eSBarry Smith MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as `MATMPIAIJCUSPARSE`. 746e057df02SPaul Mullowney 74711a5261eSBarry Smith A matrix type type whose data resides on NVIDIA GPUs. These matrices can be in either 7482692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 
74911a5261eSBarry Smith All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library. 7509ae82921SPaul Mullowney 75111a5261eSBarry Smith This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator, 75211a5261eSBarry Smith and `MATMPIAIJCUSPARSE` otherwise. As a result, for single process communicators, 75311a5261eSBarry Smith `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported 7549ae82921SPaul Mullowney for communicators controlling multiple processes. It is recommended that you call both of 7559ae82921SPaul Mullowney the above preallocation routines for simplicity. 7569ae82921SPaul Mullowney 7579ae82921SPaul Mullowney Options Database Keys: 75811a5261eSBarry Smith + -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE` during a call to `MatSetFromOptions()` 7598468deeeSKarl Rupp . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 76011a5261eSBarry Smith . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid). 76111a5261eSBarry Smith - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to `MatSetFromOptions()`. Other options include ell (ellpack) or hyb (hybrid). 
7629ae82921SPaul Mullowney 7639ae82921SPaul Mullowney Level: beginner 7649ae82921SPaul Mullowney 765db781477SPatrick Sanan .seealso: `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 7666bb69460SJunchao Zhang M*/ 7676bb69460SJunchao Zhang 7686bb69460SJunchao Zhang /*MC 76911a5261eSBarry Smith MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as `MATAIJCUSPARSE`. 7706bb69460SJunchao Zhang 7716bb69460SJunchao Zhang Level: beginner 7726bb69460SJunchao Zhang 773db781477SPatrick Sanan .seealso: `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE` 7749ae82921SPaul Mullowney M*/ 7753fa6b06aSMark Adams 776042217e8SBarry Smith // get GPU pointers to stripped down Mat. For both seq and MPI Mat. 777d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B) 778d71ae5a4SJacob Faibussowitsch { 779042217e8SBarry Smith PetscSplitCSRDataStructure d_mat; 780042217e8SBarry Smith PetscMPIInt size; 781042217e8SBarry Smith int *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL; 782042217e8SBarry Smith PetscScalar *aa = NULL, *ba = NULL; 78399551766SMark Adams Mat_SeqAIJ *jaca = NULL, *jacb = NULL; 784042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusparsestructA = NULL; 785042217e8SBarry Smith CsrMatrix *matrixA = NULL, *matrixB = NULL; 7863fa6b06aSMark Adams 7873fa6b06aSMark Adams PetscFunctionBegin; 78828b400f6SJacob Faibussowitsch PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need already assembled matrix"); 789042217e8SBarry Smith if (A->factortype != MAT_FACTOR_NONE) { 7903fa6b06aSMark Adams *B = NULL; 7913ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 7923fa6b06aSMark Adams } 7939566063dSJacob Faibussowitsch PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 79499551766SMark Adams // get jaca 7953fa6b06aSMark Adams if (size == 1) { 
796042217e8SBarry Smith PetscBool isseqaij; 797042217e8SBarry Smith 7989566063dSJacob Faibussowitsch PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij)); 799042217e8SBarry Smith if (isseqaij) { 8003fa6b06aSMark Adams jaca = (Mat_SeqAIJ *)A->data; 80128b400f6SJacob Faibussowitsch PetscCheck(jaca->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion"); 802042217e8SBarry Smith cusparsestructA = (Mat_SeqAIJCUSPARSE *)A->spptr; 803042217e8SBarry Smith d_mat = cusparsestructA->deviceMat; 8049566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 8053fa6b06aSMark Adams } else { 8063fa6b06aSMark Adams Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data; 80728b400f6SJacob Faibussowitsch PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion"); 808042217e8SBarry Smith Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr; 8093fa6b06aSMark Adams jaca = (Mat_SeqAIJ *)aij->A->data; 810042217e8SBarry Smith cusparsestructA = (Mat_SeqAIJCUSPARSE *)aij->A->spptr; 811042217e8SBarry Smith d_mat = spptr->deviceMat; 8129566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A)); 8133fa6b06aSMark Adams } 814042217e8SBarry Smith if (cusparsestructA->format == MAT_CUSPARSE_CSR) { 815042217e8SBarry Smith Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat; 81628b400f6SJacob Faibussowitsch PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A"); 817042217e8SBarry Smith matrixA = (CsrMatrix *)matstruct->mat; 818042217e8SBarry Smith bi = NULL; 819042217e8SBarry Smith bj = NULL; 820042217e8SBarry Smith ba = NULL; 821042217e8SBarry Smith } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_CUSPARSE_CSR"); 822042217e8SBarry Smith } else { 823042217e8SBarry Smith 
Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data; 82428b400f6SJacob Faibussowitsch PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion"); 825042217e8SBarry Smith jaca = (Mat_SeqAIJ *)aij->A->data; 82699551766SMark Adams jacb = (Mat_SeqAIJ *)aij->B->data; 827042217e8SBarry Smith Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr; 8283fa6b06aSMark Adams 82908401ef6SPierre Jolivet PetscCheck(A->nooffprocentries || aij->donotstash, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)"); 830042217e8SBarry Smith cusparsestructA = (Mat_SeqAIJCUSPARSE *)aij->A->spptr; 831042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE *)aij->B->spptr; 83208401ef6SPierre Jolivet PetscCheck(cusparsestructA->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat A needs MAT_CUSPARSE_CSR"); 833042217e8SBarry Smith if (cusparsestructB->format == MAT_CUSPARSE_CSR) { 8349566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A)); 8359566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B)); 836042217e8SBarry Smith Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat; 837042217e8SBarry Smith Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructB->mat; 83828b400f6SJacob Faibussowitsch PetscCheck(matstructA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A"); 83928b400f6SJacob Faibussowitsch PetscCheck(matstructB, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for B"); 840042217e8SBarry Smith matrixA = (CsrMatrix *)matstructA->mat; 841042217e8SBarry Smith matrixB = (CsrMatrix *)matstructB->mat; 8423fa6b06aSMark 
Adams if (jacb->compressedrow.use) { 843042217e8SBarry Smith if (!cusparsestructB->rowoffsets_gpu) { 844042217e8SBarry Smith cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 845042217e8SBarry Smith cusparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1); 8463fa6b06aSMark Adams } 847042217e8SBarry Smith bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data()); 848042217e8SBarry Smith } else { 849042217e8SBarry Smith bi = thrust::raw_pointer_cast(matrixB->row_offsets->data()); 850042217e8SBarry Smith } 851042217e8SBarry Smith bj = thrust::raw_pointer_cast(matrixB->column_indices->data()); 852042217e8SBarry Smith ba = thrust::raw_pointer_cast(matrixB->values->data()); 853042217e8SBarry Smith } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_CUSPARSE_CSR"); 854042217e8SBarry Smith d_mat = spptr->deviceMat; 855042217e8SBarry Smith } 8563fa6b06aSMark Adams if (jaca->compressedrow.use) { 857042217e8SBarry Smith if (!cusparsestructA->rowoffsets_gpu) { 858042217e8SBarry Smith cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 859042217e8SBarry Smith cusparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1); 8603fa6b06aSMark Adams } 861042217e8SBarry Smith ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data()); 8623fa6b06aSMark Adams } else { 863042217e8SBarry Smith ai = thrust::raw_pointer_cast(matrixA->row_offsets->data()); 8643fa6b06aSMark Adams } 865042217e8SBarry Smith aj = thrust::raw_pointer_cast(matrixA->column_indices->data()); 866042217e8SBarry Smith aa = thrust::raw_pointer_cast(matrixA->values->data()); 867042217e8SBarry Smith 868042217e8SBarry Smith if (!d_mat) { 869042217e8SBarry Smith PetscSplitCSRDataStructure h_mat; 870042217e8SBarry Smith 871042217e8SBarry Smith // create and populate strucy on host and copy on device 8729566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Create device matrix\n")); 8739566063dSJacob 
Faibussowitsch PetscCall(PetscNew(&h_mat)); 8749566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&d_mat, sizeof(*d_mat))); 875042217e8SBarry Smith if (size > 1) { /* need the colmap array */ 876042217e8SBarry Smith Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data; 87799551766SMark Adams PetscInt *colmap; 878042217e8SBarry Smith PetscInt ii, n = aij->B->cmap->n, N = A->cmap->N; 879042217e8SBarry Smith 88008401ef6SPierre Jolivet PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray"); 881042217e8SBarry Smith 8829566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(N + 1, &colmap)); 883042217e8SBarry Smith for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = (int)(ii + 1); 884365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES) 885365b711fSMark Adams { // have to make a long version of these 88699551766SMark Adams int *h_bi32, *h_bj32; 88799551766SMark Adams PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64; 8889566063dSJacob Faibussowitsch PetscCall(PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64)); 8899566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), cudaMemcpyDeviceToHost)); 89099551766SMark Adams for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i]; 8919566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), cudaMemcpyDeviceToHost)); 89299551766SMark Adams for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i]; 89399551766SMark Adams 8949566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64))); 8959566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), cudaMemcpyHostToDevice)); 8969566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64))); 8979566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_bj64, 
h_bj64, jacb->nz * sizeof(*d_bj64), cudaMemcpyHostToDevice)); 89899551766SMark Adams 89999551766SMark Adams h_mat->offdiag.i = d_bi64; 90099551766SMark Adams h_mat->offdiag.j = d_bj64; 90199551766SMark Adams h_mat->allocated_indices = PETSC_TRUE; 90299551766SMark Adams 9039566063dSJacob Faibussowitsch PetscCall(PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64)); 904365b711fSMark Adams } 905365b711fSMark Adams #else 90699551766SMark Adams h_mat->offdiag.i = (PetscInt *)bi; 90799551766SMark Adams h_mat->offdiag.j = (PetscInt *)bj; 90899551766SMark Adams h_mat->allocated_indices = PETSC_FALSE; 909365b711fSMark Adams #endif 910042217e8SBarry Smith h_mat->offdiag.a = ba; 911042217e8SBarry Smith h_mat->offdiag.n = A->rmap->n; 912042217e8SBarry Smith 9139566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap))); 9149566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), cudaMemcpyHostToDevice)); 9159566063dSJacob Faibussowitsch PetscCall(PetscFree(colmap)); 916042217e8SBarry Smith } 917042217e8SBarry Smith h_mat->rstart = A->rmap->rstart; 918042217e8SBarry Smith h_mat->rend = A->rmap->rend; 919042217e8SBarry Smith h_mat->cstart = A->cmap->rstart; 920042217e8SBarry Smith h_mat->cend = A->cmap->rend; 92149b994a9SMark Adams h_mat->M = A->cmap->N; 922365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES) 923365b711fSMark Adams { 92499551766SMark Adams int *h_ai32, *h_aj32; 92599551766SMark Adams PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64; 92663a3b9bcSJacob Faibussowitsch 92763a3b9bcSJacob Faibussowitsch static_assert(sizeof(PetscInt) == 8, ""); 9289566063dSJacob Faibussowitsch PetscCall(PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64)); 9299566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), cudaMemcpyDeviceToHost)); 93099551766SMark Adams for (int i = 0; i < A->rmap->n 
+ 1; i++) h_ai64[i] = h_ai32[i]; 9319566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), cudaMemcpyDeviceToHost)); 93299551766SMark Adams for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i]; 93399551766SMark Adams 9349566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64))); 9359566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), cudaMemcpyHostToDevice)); 9369566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64))); 9379566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), cudaMemcpyHostToDevice)); 93899551766SMark Adams 93999551766SMark Adams h_mat->diag.i = d_ai64; 94099551766SMark Adams h_mat->diag.j = d_aj64; 94199551766SMark Adams h_mat->allocated_indices = PETSC_TRUE; 94299551766SMark Adams 9439566063dSJacob Faibussowitsch PetscCall(PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64)); 944365b711fSMark Adams } 945365b711fSMark Adams #else 94699551766SMark Adams h_mat->diag.i = (PetscInt *)ai; 94799551766SMark Adams h_mat->diag.j = (PetscInt *)aj; 94899551766SMark Adams h_mat->allocated_indices = PETSC_FALSE; 949365b711fSMark Adams #endif 950042217e8SBarry Smith h_mat->diag.a = aa; 951042217e8SBarry Smith h_mat->diag.n = A->rmap->n; 952042217e8SBarry Smith h_mat->rank = PetscGlobalRank; 953042217e8SBarry Smith // copy pointers and metadata to device 9549566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_mat, h_mat, sizeof(*d_mat), cudaMemcpyHostToDevice)); 9559566063dSJacob Faibussowitsch PetscCall(PetscFree(h_mat)); 956042217e8SBarry Smith } else { 9579566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Reusing device matrix\n")); 958042217e8SBarry Smith } 959042217e8SBarry Smith *B = d_mat; 960042217e8SBarry Smith A->offloadmask = PETSC_OFFLOAD_GPU; 9613ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 
9623fa6b06aSMark Adams } 963