/* MPIAIJCUSPARSE: MPI AIJ matrix with cuSPARSE-backed sequential diagonal (A)
   and off-diagonal (B) blocks. This file implements COO assembly (basic and
   extended paths), preallocation, and the mat-vec operations on the GPU.
   NOTE(review): reconstructed from a git-blame-annotated dump; verify against
   upstream PETSc src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu. */
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

/* Copies element <0> of a thrust tuple into element <1>; used below to gather
   permuted COO values into a contiguous work array */
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Free all device-side COO state held in the Mat_MPIAIJCUSPARSE struct.
   Safe to call when no COO preallocation was ever done (spptr may be NULL). */
static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat) {
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(0);
  if (cusparseStruct->use_extended_coo) {
    /* arrays below are only allocated on the extended-COO path */
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
    PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  /* coo_p/coo_pw belong to the basic COO path; delete on NULL is a no-op */
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(0);
}

/* Basic COO value insertion: if a device permutation (coo_p) exists, gather
   v[] through it into coo_pw, then hand the diagonal/off-diagonal slices to
   the SeqAIJCUSPARSE kernels. v may live on host or device (isCudaMem). */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) {
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)a->spptr;
  PetscInt            n    = cusp->coo_nd + cusp->coo_no;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                          *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* stage host values on the device */
      w = new THRUSTARRAY(n);
      w->assign(v, v + n);
      PetscCall(PetscLogCpuToGpu(n * sizeof(PetscScalar)));
      d_v = w->data();
    }

    /* zip (permuted source, destination) so VecCUDAEquals performs the gather */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->begin()), cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v, cusp->coo_p->end()), cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit, zieit, VecCUDAEquals());
    PetscCall(PetscLogGpuTimeEnd());
    delete w;
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, cusp->coo_pw->data().get(), imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, cusp->coo_pw->data().get() + cusp->coo_nd, imode));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A, v, imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B, v ? v + cusp->coo_nd : NULL, imode));
  }
  PetscFunctionReturn(0);
}

/* Predicate on (row, col, perm) tuples: true when the column index lies in
   the local diagonal-block range [cstart, cend) */
template <typename Tuple>
struct IsNotOffDiagT {
  PetscInt _cstart, _cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(Tuple t) { return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); }
};

/* Predicate on a bare column index: true when it is OUTSIDE [cstart, cend) */
struct IsOffDiag {
  PetscInt _cstart, _cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) { }
  __host__ __device__ inline bool operator()(const PetscInt &c) { return c < _cstart || c >= _cend; }
};

/* Shift a global index down by the local range start */
struct GlobToLoc {
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) { }
  __host__ __device__ inline PetscInt operator()(const PetscInt &c) { return c - _start; }
};

/* Basic COO preallocation: partition the n (i,j) pairs on the device into
   diagonal-first/off-diagonal-next order (recording the permutation in
   coo_p), localize indices, build the compacted off-diagonal column map
   (garray), and create the two SeqAIJCUSPARSE blocks. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, PetscInt coo_i[], PetscInt coo_j[]) {
  Mat_MPIAIJ            *b    = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE    *cusp = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt               N, *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2. * n * sizeof(PetscInt)));
  d_i.assign(coo_i, coo_i + n);
  d_j.assign(coo_j, coo_j + n);
  PetscCall(PetscLogGpuTimeBegin());
  auto firstoffd = thrust::find_if(thrust::device, d_j.begin(), d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device, firstoffd, d_j.end(), IsOffDiag(B->cmap->rstart, B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* entries are interleaved: stably partition (i, j, identity-perm) so that
       diagonal entries come first; coo_p records where each input value goes */
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device, cusp->coo_p->begin(), cusp->coo_p->end(), 0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(), d_j.begin(), cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(), d_j.end(), cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device, fzipp, ezipp, IsNotOffDiagT<thrust::tuple<PetscInt, PetscInt, PetscInt>>(B->cmap->rstart, B->cmap->rend));
    firstoffd  = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(), firstoffd);
  cusp->coo_no = thrust::distance(firstoffd, d_j.end());

  /* from global to local */
  thrust::transform(thrust::device, d_i.begin(), d_i.end(), d_i.begin(), GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device, d_j.begin(), firstoffd, d_j.begin(), GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  PetscCall(PetscMalloc1(cusp->coo_no, &jj)); /* jj[] will store compacted col ids of the offdiag part */
  PetscCallCUDA(cudaMemcpy(jj, d_j.data().get() + cusp->coo_nd, cusp->coo_no * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j, cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device, o_j, d_j.end());
  auto wit = thrust::unique(thrust::device, o_j, d_j.end()); /* return end iter of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j, wit);
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff, &b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray, d_j.data().get() + cusp->coo_nd, noff * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff + cusp->coo_no) * sizeof(PetscInt)));
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF, 1, noff, b->garray, PETSC_COPY_VALUES, &l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g, ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISGlobalToLocalMappingApply(l2g, IS_GTOLM_DROP, cusp->coo_no, jj, &N, jj));
  PetscCheck(N == cusp->coo_no, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size", N, cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
  PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(PetscLogObjectParent((PetscObject)B, (PetscObject)b->A));
  PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
  PetscCall(MatSetSizes(b->B, B->rmap->n, noff, B->rmap->n, noff));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(PetscLogObjectParent((PetscObject)B, (PetscObject)b->B));

  /* GPU memory, cusparse specific call handles it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A, cusp->coo_nd, d_i.data().get(), d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B, cusp->coo_no, d_i.data().get() + cusp->coo_nd, jj));
  PetscCall(PetscFree(jj));

  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusp->offdiagGPUMatFormat));

  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(0);
}

/* COO preallocation entry point. Uses the fast "basic" path when every rank
   only has local, non-negative entries; otherwise falls back to the generic
   MPIAIJ COO machinery and mirrors its host-built maps onto the device. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) {
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ *)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart, rend;

  PetscFunctionBegin;
  /* discard any previous assembly state */
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscLayoutGetRange(mat->rmap, &rstart, &rend));
    PetscCall(PetscGetMemType(coo_i, &mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k = 0; k < coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k] < 0 || coo_i[k] < rstart || coo_i[k] >= rend || coo_j[k] < 0) {
          coo_basic = PETSC_FALSE;
          break;
        }
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE, &coo_basic, 1, MPIU_BOOL, MPI_LAND, PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat, coo_n, coo_i, coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, coo_i, coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev                   = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    /* mirror the host-side COO maps built by MatSetPreallocationCOO_MPIAIJ */
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap1_d, (mpiaij->Annz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm1_d, mpiaij->Atot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap1_d, (mpiaij->Bnnz + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm1_d, mpiaij->Btot1 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aimap2_d, mpiaij->Annz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Ajmap2_d, (mpiaij->Annz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Aperm2_d, mpiaij->Atot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bimap2_d, mpiaij->Bnnz2 * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bjmap2_d, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->Bperm2_d, mpiaij->Btot2 * sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void **)&mpidev->Cperm1_d, mpiaij->sendlen * sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->sendbuf_d, mpiaij->sendlen * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void **)&mpidev->recvbuf_d, mpiaij->recvlen * sizeof(PetscScalar)));

    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d, mpiaij->Ajmap1, (mpiaij->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d, mpiaij->Aperm1, mpiaij->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d, mpiaij->Bjmap1, (mpiaij->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d, mpiaij->Bperm1, mpiaij->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d, mpiaij->Aimap2, mpiaij->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d, mpiaij->Ajmap2, (mpiaij->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d, mpiaij->Aperm2, mpiaij->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d, mpiaij->Bimap2, mpiaij->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d, mpiaij->Bjmap2, (mpiaij->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d, mpiaij->Bperm2, mpiaij->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d, mpiaij->Cperm1, mpiaij->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(0);
}

/* Gather kernel: buf[i] = kv[perm[i]] for i in [0, nnz); grid-stride loop, so
   any launch configuration is valid */
__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[]) {
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
}

/* Accumulate locally-owned COO values into the A (first Annz entries of the
   flattened index space) and B (remaining Bnnz) matrix value arrays. With
   INSERT_VALUES the previous matrix values are discarded. */
__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[]) {
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      i -= Annz;
      for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
      Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
    }
  }
}

/* Accumulate received remote COO values; always additive, since the local
   kernel has already applied the insert/add semantics */
__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[]) {
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      i -= Annz2;
      for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
    }
  }
}

/* COO value insertion entry point. Extended path: pack values destined for
   other ranks, start the SF reduction, add local contributions while the
   communication is in flight, then add the received remote contributions. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode) {
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE *>(mpiaij->spptr);
  Mat                 A = mpiaij->A, B = mpiaij->B;
  PetscCount          Annz = mpiaij->Annz, Annz2 = mpiaij->Annz2, Bnnz = mpiaij->Bnnz, Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa, *Ba = NULL;
  PetscScalar        *vsend = mpidev->sendbuf_d, *v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1     = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d, *Ajmap2 = mpidev->Ajmap2_d, *Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d, *Bjmap2 = mpidev->Bjmap2_d, *Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d, *Aperm2 = mpidev->Aperm2_d, *Bperm1 = mpidev->Bperm1_d, *Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat), &size));
    PetscCall(PetscGetMemType(v, &memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      PetscCallCUDA(cudaMalloc((void **)&v1, mpiaij->coo_n * sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void *)v1, v, mpiaij->coo_n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen + 255) / 256, 256>>>(v1, mpiaij->sendlen, Cperm1, vsend);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz + Bnnz > 0) {
      MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    PetscCall(PetscSFReduceEnd(mpiaij->coo_sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

    /* Add received remote entries to A and B */
    if (Annz2 + Bnnz2 > 0) {
      MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
    }
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  } else {
    PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat, v, imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Merge the diagonal and off-diagonal blocks into one sequential matrix;
   optionally return the global column indices of the merged matrix in glob */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc) {
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    for (i = 0; i < dn; i++) gidx[i] = cst + i;       /* diagonal block: contiguous owned columns */
    for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];  /* off-diagonal block: mapped via cmap */
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(0);
}

/* Classic (non-COO) preallocation: (re)create the A/B blocks as
   SEQAIJCUSPARSE and forward the nnz hints to them */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[]) {
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) { PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]); }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) { PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]); }
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
    PetscCall(PetscLogObjectParent((PetscObject)B, (PetscObject)b->A));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
    PetscCall(PetscLogObjectParent((PetscObject)B, (PetscObject)b->B));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* y = M x: overlap the halo scatter with the diagonal-block multiply */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
  PetscFunctionReturn(0);
}

/* Zero both stored blocks */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A) {
  Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(0);
}

/* z = y + M x: same overlap pattern as MatMult */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz) {
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(0);
}

/* y = M^T x: transpose multiplies first, then reverse-scatter the
   off-diagonal contribution with addition */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy) {
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(0);
}

/* Select the cuSPARSE storage format for the diagonal and/or off-diagonal
   block multiplies.
   NOTE(review): the source dump was truncated inside the default-case error
   string below; the tail (message text, closing braces, return) was
   reconstructed from the standard PETSc implementation — confirm upstream. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format) {
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG: cusparseStruct->diagGPUMatFormat = format; break;
  case MAT_CUSPARSE_MULT_OFFDIAG: cusparseStruct->offdiagGPUMatFormat = format; break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(0);
}
Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.", op); 495045c96e1SPaul Mullowney } 496ca45077fSPaul Mullowney PetscFunctionReturn(0); 497ca45077fSPaul Mullowney } 498e057df02SPaul Mullowney 4999371c9d4SSatish Balay PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject) { 500e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 501e057df02SPaul Mullowney PetscBool flg; 502a183c035SDominic Meiser Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data; 503a183c035SDominic Meiser Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr; 5045fd66863SKarl Rupp 505e057df02SPaul Mullowney PetscFunctionBegin; 506d0609cedSBarry Smith PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options"); 507e057df02SPaul Mullowney if (A->factortype == MAT_FACTOR_NONE) { 5089371c9d4SSatish Balay PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg)); 5099566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format)); 5109371c9d4SSatish Balay PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg)); 5119566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format)); 5129371c9d4SSatish Balay PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum 
*)&format, &flg)); 5139566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format)); 514e057df02SPaul Mullowney } 515d0609cedSBarry Smith PetscOptionsHeadEnd(); 516e057df02SPaul Mullowney PetscFunctionReturn(0); 517e057df02SPaul Mullowney } 518e057df02SPaul Mullowney 5199371c9d4SSatish Balay PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode) { 5203fa6b06aSMark Adams Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data; 521042217e8SBarry Smith Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE *)mpiaij->spptr; 522042217e8SBarry Smith PetscObjectState onnz = A->nonzerostate; 523042217e8SBarry Smith 52434d6c7a5SJose E. Roman PetscFunctionBegin; 5259566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd_MPIAIJ(A, mode)); 5269566063dSJacob Faibussowitsch if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA)); 527042217e8SBarry Smith if (onnz != A->nonzerostate && cusp->deviceMat) { 528042217e8SBarry Smith PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat; 529042217e8SBarry Smith 5309566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Destroy device mat since nonzerostate changed\n")); 5319566063dSJacob Faibussowitsch PetscCall(PetscNew(&h_mat)); 5329566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost)); 5339566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->colmap)); 53499551766SMark Adams if (h_mat->allocated_indices) { 5359566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->diag.i)); 5369566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->diag.j)); 53799551766SMark Adams if (h_mat->offdiag.j) { 5389566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->offdiag.i)); 5399566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->offdiag.j)); 54099551766SMark Adams } 54199551766SMark Adams } 5429566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(d_mat)); 5439566063dSJacob Faibussowitsch PetscCall(PetscFree(h_mat)); 
544042217e8SBarry Smith cusp->deviceMat = NULL; 5453fa6b06aSMark Adams } 54634d6c7a5SJose E. Roman PetscFunctionReturn(0); 54734d6c7a5SJose E. Roman } 54834d6c7a5SJose E. Roman 5499371c9d4SSatish Balay PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) { 5503fa6b06aSMark Adams Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data; 5513fa6b06aSMark Adams Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr; 552bbf3fe20SPaul Mullowney 553bbf3fe20SPaul Mullowney PetscFunctionBegin; 55428b400f6SJacob Faibussowitsch PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr"); 5553fa6b06aSMark Adams if (cusparseStruct->deviceMat) { 556042217e8SBarry Smith PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat; 557042217e8SBarry Smith 5589566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Have device matrix\n")); 5599566063dSJacob Faibussowitsch PetscCall(PetscNew(&h_mat)); 5609566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_mat, d_mat, sizeof(*d_mat), cudaMemcpyDeviceToHost)); 5619566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->colmap)); 56299551766SMark Adams if (h_mat->allocated_indices) { 5639566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->diag.i)); 5649566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->diag.j)); 56599551766SMark Adams if (h_mat->offdiag.j) { 5669566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->offdiag.i)); 5679566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->offdiag.j)); 56899551766SMark Adams } 56999551766SMark Adams } 5709566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(d_mat)); 5719566063dSJacob Faibussowitsch PetscCall(PetscFree(h_mat)); 5723fa6b06aSMark Adams } 573cbc6b225SStefano Zampini /* Free COO */ 5749566063dSJacob Faibussowitsch PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A)); 575acbf8a88SJunchao Zhang PetscCallCXX(delete cusparseStruct); 5769566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, 
"MatMPIAIJSetPreallocation_C", NULL)); 5779566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL)); 5789566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 5799566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 5809566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL)); 5819566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL)); 5829566063dSJacob Faibussowitsch PetscCall(MatDestroy_MPIAIJ(A)); 583bbf3fe20SPaul Mullowney PetscFunctionReturn(0); 584bbf3fe20SPaul Mullowney } 585ca45077fSPaul Mullowney 5869371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat *newmat) { 587bbf3fe20SPaul Mullowney Mat_MPIAIJ *a; 5883338378cSStefano Zampini Mat A; 5899ae82921SPaul Mullowney 5909ae82921SPaul Mullowney PetscFunctionBegin; 5919566063dSJacob Faibussowitsch PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 5929566063dSJacob Faibussowitsch if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat)); 5939566063dSJacob Faibussowitsch else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN)); 5943338378cSStefano Zampini A = *newmat; 5956f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 5969566063dSJacob Faibussowitsch PetscCall(PetscFree(A->defaultvectype)); 5979566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype)); 59834136279SStefano Zampini 599bbf3fe20SPaul Mullowney a = (Mat_MPIAIJ *)A->data; 6009566063dSJacob Faibussowitsch if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE)); 6019566063dSJacob Faibussowitsch if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE)); 6029566063dSJacob 
Faibussowitsch if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA)); 603d98d7c49SStefano Zampini 604*48a46eb9SPierre Jolivet if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE); 6052205254eSKarl Rupp 60634d6c7a5SJose E. Roman A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 607bbf3fe20SPaul Mullowney A->ops->mult = MatMult_MPIAIJCUSPARSE; 608fdc842d1SBarry Smith A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 609bbf3fe20SPaul Mullowney A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 610bbf3fe20SPaul Mullowney A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 611bbf3fe20SPaul Mullowney A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 6123fa6b06aSMark Adams A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE; 6134e84afc0SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND; 6142205254eSKarl Rupp 6159566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE)); 6169566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE)); 6179566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE)); 6189566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE)); 6199566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE)); 6209566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE)); 621ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE) 6229566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE)); 623ae48a8d0SStefano 
Zampini #endif 6249ae82921SPaul Mullowney PetscFunctionReturn(0); 6259ae82921SPaul Mullowney } 6269ae82921SPaul Mullowney 6279371c9d4SSatish Balay PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) { 6283338378cSStefano Zampini PetscFunctionBegin; 6299566063dSJacob Faibussowitsch PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 6309566063dSJacob Faibussowitsch PetscCall(MatCreate_MPIAIJ(A)); 6319566063dSJacob Faibussowitsch PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A)); 6323338378cSStefano Zampini PetscFunctionReturn(0); 6333338378cSStefano Zampini } 6343338378cSStefano Zampini 6353f9c0db1SPaul Mullowney /*@ 6363f9c0db1SPaul Mullowney MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 637e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 6383f9c0db1SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 639e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 640e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 641e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 6429ae82921SPaul Mullowney 643d083f849SBarry Smith Collective 644e057df02SPaul Mullowney 645e057df02SPaul Mullowney Input Parameters: 646e057df02SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 647e057df02SPaul Mullowney . m - number of rows 648e057df02SPaul Mullowney . n - number of columns 649e057df02SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 650e057df02SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 6510298fd71SBarry Smith (possibly different for each row) or NULL 652e057df02SPaul Mullowney 653e057df02SPaul Mullowney Output Parameter: 654e057df02SPaul Mullowney . 
A - the matrix 655e057df02SPaul Mullowney 656e057df02SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 657e057df02SPaul Mullowney MatXXXXSetPreallocation() paradigm instead of this routine directly. 658e057df02SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 659e057df02SPaul Mullowney 660e057df02SPaul Mullowney Notes: 661e057df02SPaul Mullowney If nnz is given then nz is ignored 662e057df02SPaul Mullowney 663e057df02SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 664e057df02SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 665e057df02SPaul Mullowney storage. That is, the stored row and column indices can begin at 666e057df02SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 667e057df02SPaul Mullowney 668e057df02SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 6690298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 670e057df02SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 671e057df02SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 672e057df02SPaul Mullowney 673e057df02SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 674e057df02SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 675e057df02SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 676e057df02SPaul Mullowney reusing matrix information to achieve increased efficiency. 
677e057df02SPaul Mullowney 678e057df02SPaul Mullowney Level: intermediate 679e057df02SPaul Mullowney 680db781477SPatrick Sanan .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATMPIAIJCUSPARSE`, `MATAIJCUSPARSE` 681e057df02SPaul Mullowney @*/ 6829371c9d4SSatish Balay PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A) { 6839ae82921SPaul Mullowney PetscMPIInt size; 6849ae82921SPaul Mullowney 6859ae82921SPaul Mullowney PetscFunctionBegin; 6869566063dSJacob Faibussowitsch PetscCall(MatCreate(comm, A)); 6879566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A, m, n, M, N)); 6889566063dSJacob Faibussowitsch PetscCallMPI(MPI_Comm_size(comm, &size)); 6899ae82921SPaul Mullowney if (size > 1) { 6909566063dSJacob Faibussowitsch PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE)); 6919566063dSJacob Faibussowitsch PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz)); 6929ae82921SPaul Mullowney } else { 6939566063dSJacob Faibussowitsch PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE)); 6949566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz)); 6959ae82921SPaul Mullowney } 6969ae82921SPaul Mullowney PetscFunctionReturn(0); 6979ae82921SPaul Mullowney } 6989ae82921SPaul Mullowney 6993ca39a21SBarry Smith /*MC 7006bb69460SJunchao Zhang MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATMPIAIJCUSPARSE. 701e057df02SPaul Mullowney 7022692e278SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 7032692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 7042692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 
7059ae82921SPaul Mullowney 7069ae82921SPaul Mullowney This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 7079ae82921SPaul Mullowney and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 7089ae82921SPaul Mullowney MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 7099ae82921SPaul Mullowney for communicators controlling multiple processes. It is recommended that you call both of 7109ae82921SPaul Mullowney the above preallocation routines for simplicity. 7119ae82921SPaul Mullowney 7129ae82921SPaul Mullowney Options Database Keys: 713e057df02SPaul Mullowney + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 7148468deeeSKarl Rupp . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 7158468deeeSKarl Rupp . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 7168468deeeSKarl Rupp - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 7179ae82921SPaul Mullowney 7189ae82921SPaul Mullowney Level: beginner 7199ae82921SPaul Mullowney 720db781477SPatrick Sanan .seealso: `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 7216bb69460SJunchao Zhang M*/ 7226bb69460SJunchao Zhang 7236bb69460SJunchao Zhang /*MC 7246bb69460SJunchao Zhang MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATAIJCUSPARSE. 
7256bb69460SJunchao Zhang 7266bb69460SJunchao Zhang Level: beginner 7276bb69460SJunchao Zhang 728db781477SPatrick Sanan .seealso: `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE` 7299ae82921SPaul Mullowney M*/ 7303fa6b06aSMark Adams 731042217e8SBarry Smith // get GPU pointers to stripped down Mat. For both seq and MPI Mat. 7329371c9d4SSatish Balay PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B) { 733042217e8SBarry Smith PetscSplitCSRDataStructure d_mat; 734042217e8SBarry Smith PetscMPIInt size; 735042217e8SBarry Smith int *ai = NULL, *bi = NULL, *aj = NULL, *bj = NULL; 736042217e8SBarry Smith PetscScalar *aa = NULL, *ba = NULL; 73799551766SMark Adams Mat_SeqAIJ *jaca = NULL, *jacb = NULL; 738042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusparsestructA = NULL; 739042217e8SBarry Smith CsrMatrix *matrixA = NULL, *matrixB = NULL; 7403fa6b06aSMark Adams 7413fa6b06aSMark Adams PetscFunctionBegin; 74228b400f6SJacob Faibussowitsch PetscCheck(A->assembled, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Need already assembled matrix"); 743042217e8SBarry Smith if (A->factortype != MAT_FACTOR_NONE) { 7443fa6b06aSMark Adams *B = NULL; 7453fa6b06aSMark Adams PetscFunctionReturn(0); 7463fa6b06aSMark Adams } 7479566063dSJacob Faibussowitsch PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A), &size)); 74899551766SMark Adams // get jaca 7493fa6b06aSMark Adams if (size == 1) { 750042217e8SBarry Smith PetscBool isseqaij; 751042217e8SBarry Smith 7529566063dSJacob Faibussowitsch PetscCall(PetscObjectBaseTypeCompare((PetscObject)A, MATSEQAIJ, &isseqaij)); 753042217e8SBarry Smith if (isseqaij) { 7543fa6b06aSMark Adams jaca = (Mat_SeqAIJ *)A->data; 75528b400f6SJacob Faibussowitsch PetscCheck(jaca->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion"); 756042217e8SBarry Smith cusparsestructA = (Mat_SeqAIJCUSPARSE *)A->spptr; 757042217e8SBarry Smith d_mat = 
cusparsestructA->deviceMat; 7589566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 7593fa6b06aSMark Adams } else { 7603fa6b06aSMark Adams Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data; 76128b400f6SJacob Faibussowitsch PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion"); 762042217e8SBarry Smith Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr; 7633fa6b06aSMark Adams jaca = (Mat_SeqAIJ *)aij->A->data; 764042217e8SBarry Smith cusparsestructA = (Mat_SeqAIJCUSPARSE *)aij->A->spptr; 765042217e8SBarry Smith d_mat = spptr->deviceMat; 7669566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A)); 7673fa6b06aSMark Adams } 768042217e8SBarry Smith if (cusparsestructA->format == MAT_CUSPARSE_CSR) { 769042217e8SBarry Smith Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat; 77028b400f6SJacob Faibussowitsch PetscCheck(matstruct, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A"); 771042217e8SBarry Smith matrixA = (CsrMatrix *)matstruct->mat; 772042217e8SBarry Smith bi = NULL; 773042217e8SBarry Smith bj = NULL; 774042217e8SBarry Smith ba = NULL; 775042217e8SBarry Smith } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat needs MAT_CUSPARSE_CSR"); 776042217e8SBarry Smith } else { 777042217e8SBarry Smith Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data; 77828b400f6SJacob Faibussowitsch PetscCheck(aij->roworiented, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "Device assembly does not currently support column oriented values insertion"); 779042217e8SBarry Smith jaca = (Mat_SeqAIJ *)aij->A->data; 78099551766SMark Adams jacb = (Mat_SeqAIJ *)aij->B->data; 781042217e8SBarry Smith Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE *)aij->spptr; 7823fa6b06aSMark Adams 78308401ef6SPierre Jolivet PetscCheck(A->nooffprocentries || aij->donotstash, PetscObjectComm((PetscObject)A), 
PETSC_ERR_SUP, "Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)"); 784042217e8SBarry Smith cusparsestructA = (Mat_SeqAIJCUSPARSE *)aij->A->spptr; 785042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE *)aij->B->spptr; 78608401ef6SPierre Jolivet PetscCheck(cusparsestructA->format == MAT_CUSPARSE_CSR, PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat A needs MAT_CUSPARSE_CSR"); 787042217e8SBarry Smith if (cusparsestructB->format == MAT_CUSPARSE_CSR) { 7889566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A)); 7899566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B)); 790042217e8SBarry Smith Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructA->mat; 791042217e8SBarry Smith Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct *)cusparsestructB->mat; 79228b400f6SJacob Faibussowitsch PetscCheck(matstructA, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for A"); 79328b400f6SJacob Faibussowitsch PetscCheck(matstructB, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing Mat_SeqAIJCUSPARSEMultStruct for B"); 794042217e8SBarry Smith matrixA = (CsrMatrix *)matstructA->mat; 795042217e8SBarry Smith matrixB = (CsrMatrix *)matstructB->mat; 7963fa6b06aSMark Adams if (jacb->compressedrow.use) { 797042217e8SBarry Smith if (!cusparsestructB->rowoffsets_gpu) { 798042217e8SBarry Smith cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 799042217e8SBarry Smith cusparsestructB->rowoffsets_gpu->assign(jacb->i, jacb->i + A->rmap->n + 1); 8003fa6b06aSMark Adams } 801042217e8SBarry Smith bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data()); 802042217e8SBarry Smith } else { 803042217e8SBarry Smith bi = thrust::raw_pointer_cast(matrixB->row_offsets->data()); 804042217e8SBarry Smith } 
805042217e8SBarry Smith bj = thrust::raw_pointer_cast(matrixB->column_indices->data()); 806042217e8SBarry Smith ba = thrust::raw_pointer_cast(matrixB->values->data()); 807042217e8SBarry Smith } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Device Mat B needs MAT_CUSPARSE_CSR"); 808042217e8SBarry Smith d_mat = spptr->deviceMat; 809042217e8SBarry Smith } 8103fa6b06aSMark Adams if (jaca->compressedrow.use) { 811042217e8SBarry Smith if (!cusparsestructA->rowoffsets_gpu) { 812042217e8SBarry Smith cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n + 1); 813042217e8SBarry Smith cusparsestructA->rowoffsets_gpu->assign(jaca->i, jaca->i + A->rmap->n + 1); 8143fa6b06aSMark Adams } 815042217e8SBarry Smith ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data()); 8163fa6b06aSMark Adams } else { 817042217e8SBarry Smith ai = thrust::raw_pointer_cast(matrixA->row_offsets->data()); 8183fa6b06aSMark Adams } 819042217e8SBarry Smith aj = thrust::raw_pointer_cast(matrixA->column_indices->data()); 820042217e8SBarry Smith aa = thrust::raw_pointer_cast(matrixA->values->data()); 821042217e8SBarry Smith 822042217e8SBarry Smith if (!d_mat) { 823042217e8SBarry Smith PetscSplitCSRDataStructure h_mat; 824042217e8SBarry Smith 825042217e8SBarry Smith // create and populate strucy on host and copy on device 8269566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Create device matrix\n")); 8279566063dSJacob Faibussowitsch PetscCall(PetscNew(&h_mat)); 8289566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&d_mat, sizeof(*d_mat))); 829042217e8SBarry Smith if (size > 1) { /* need the colmap array */ 830042217e8SBarry Smith Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data; 83199551766SMark Adams PetscInt *colmap; 832042217e8SBarry Smith PetscInt ii, n = aij->B->cmap->n, N = A->cmap->N; 833042217e8SBarry Smith 83408401ef6SPierre Jolivet PetscCheck(!n || aij->garray, PETSC_COMM_SELF, PETSC_ERR_PLIB, "MPIAIJ Matrix was assembled but is missing garray"); 835042217e8SBarry 
Smith 8369566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(N + 1, &colmap)); 837042217e8SBarry Smith for (ii = 0; ii < n; ii++) colmap[aij->garray[ii]] = (int)(ii + 1); 838365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES) 839365b711fSMark Adams { // have to make a long version of these 84099551766SMark Adams int *h_bi32, *h_bj32; 84199551766SMark Adams PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64; 8429566063dSJacob Faibussowitsch PetscCall(PetscCalloc4(A->rmap->n + 1, &h_bi32, jacb->nz, &h_bj32, A->rmap->n + 1, &h_bi64, jacb->nz, &h_bj64)); 8439566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n + 1) * sizeof(*h_bi32), cudaMemcpyDeviceToHost)); 84499551766SMark Adams for (int i = 0; i < A->rmap->n + 1; i++) h_bi64[i] = h_bi32[i]; 8459566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz * sizeof(*h_bj32), cudaMemcpyDeviceToHost)); 84699551766SMark Adams for (int i = 0; i < jacb->nz; i++) h_bj64[i] = h_bj32[i]; 84799551766SMark Adams 8489566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&d_bi64, (A->rmap->n + 1) * sizeof(*d_bi64))); 8499566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64, (A->rmap->n + 1) * sizeof(*d_bi64), cudaMemcpyHostToDevice)); 8509566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&d_bj64, jacb->nz * sizeof(*d_bj64))); 8519566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64, jacb->nz * sizeof(*d_bj64), cudaMemcpyHostToDevice)); 85299551766SMark Adams 85399551766SMark Adams h_mat->offdiag.i = d_bi64; 85499551766SMark Adams h_mat->offdiag.j = d_bj64; 85599551766SMark Adams h_mat->allocated_indices = PETSC_TRUE; 85699551766SMark Adams 8579566063dSJacob Faibussowitsch PetscCall(PetscFree4(h_bi32, h_bj32, h_bi64, h_bj64)); 858365b711fSMark Adams } 859365b711fSMark Adams #else 86099551766SMark Adams h_mat->offdiag.i = (PetscInt *)bi; 86199551766SMark Adams h_mat->offdiag.j = (PetscInt *)bj; 86299551766SMark Adams 
h_mat->allocated_indices = PETSC_FALSE; 863365b711fSMark Adams #endif 864042217e8SBarry Smith h_mat->offdiag.a = ba; 865042217e8SBarry Smith h_mat->offdiag.n = A->rmap->n; 866042217e8SBarry Smith 8679566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&h_mat->colmap, (N + 1) * sizeof(*h_mat->colmap))); 8689566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_mat->colmap, colmap, (N + 1) * sizeof(*h_mat->colmap), cudaMemcpyHostToDevice)); 8699566063dSJacob Faibussowitsch PetscCall(PetscFree(colmap)); 870042217e8SBarry Smith } 871042217e8SBarry Smith h_mat->rstart = A->rmap->rstart; 872042217e8SBarry Smith h_mat->rend = A->rmap->rend; 873042217e8SBarry Smith h_mat->cstart = A->cmap->rstart; 874042217e8SBarry Smith h_mat->cend = A->cmap->rend; 87549b994a9SMark Adams h_mat->M = A->cmap->N; 876365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES) 877365b711fSMark Adams { 87899551766SMark Adams int *h_ai32, *h_aj32; 87999551766SMark Adams PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64; 88063a3b9bcSJacob Faibussowitsch 88163a3b9bcSJacob Faibussowitsch static_assert(sizeof(PetscInt) == 8, ""); 8829566063dSJacob Faibussowitsch PetscCall(PetscCalloc4(A->rmap->n + 1, &h_ai32, jaca->nz, &h_aj32, A->rmap->n + 1, &h_ai64, jaca->nz, &h_aj64)); 8839566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n + 1) * sizeof(*h_ai32), cudaMemcpyDeviceToHost)); 88499551766SMark Adams for (int i = 0; i < A->rmap->n + 1; i++) h_ai64[i] = h_ai32[i]; 8859566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz * sizeof(*h_aj32), cudaMemcpyDeviceToHost)); 88699551766SMark Adams for (int i = 0; i < jaca->nz; i++) h_aj64[i] = h_aj32[i]; 88799551766SMark Adams 8889566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void **)&d_ai64, (A->rmap->n + 1) * sizeof(*d_ai64))); 8899566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64, (A->rmap->n + 1) * sizeof(*d_ai64), cudaMemcpyHostToDevice)); 8909566063dSJacob Faibussowitsch 
PetscCallCUDA(cudaMalloc((void **)&d_aj64, jaca->nz * sizeof(*d_aj64))); 8919566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64, jaca->nz * sizeof(*d_aj64), cudaMemcpyHostToDevice)); 89299551766SMark Adams 89399551766SMark Adams h_mat->diag.i = d_ai64; 89499551766SMark Adams h_mat->diag.j = d_aj64; 89599551766SMark Adams h_mat->allocated_indices = PETSC_TRUE; 89699551766SMark Adams 8979566063dSJacob Faibussowitsch PetscCall(PetscFree4(h_ai32, h_aj32, h_ai64, h_aj64)); 898365b711fSMark Adams } 899365b711fSMark Adams #else 90099551766SMark Adams h_mat->diag.i = (PetscInt *)ai; 90199551766SMark Adams h_mat->diag.j = (PetscInt *)aj; 90299551766SMark Adams h_mat->allocated_indices = PETSC_FALSE; 903365b711fSMark Adams #endif 904042217e8SBarry Smith h_mat->diag.a = aa; 905042217e8SBarry Smith h_mat->diag.n = A->rmap->n; 906042217e8SBarry Smith h_mat->rank = PetscGlobalRank; 907042217e8SBarry Smith // copy pointers and metadata to device 9089566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_mat, h_mat, sizeof(*d_mat), cudaMemcpyHostToDevice)); 9099566063dSJacob Faibussowitsch PetscCall(PetscFree(h_mat)); 910042217e8SBarry Smith } else { 9119566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Reusing device matrix\n")); 912042217e8SBarry Smith } 913042217e8SBarry Smith *B = d_mat; 914042217e8SBarry Smith A->offloadmask = PETSC_OFFLOAD_GPU; 9153fa6b06aSMark Adams PetscFunctionReturn(0); 9163fa6b06aSMark Adams } 917