#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

/* Thrust functor: copy the first element of a zip tuple into the second (device-side gather). */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Free all device-side COO assembly state (extended-COO maps/permutations and the
   basic-COO permutation arrays) attached to the Mat_MPIAIJCUSPARSE spptr. Safe to
   call when no COO preallocation has been done. */
static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(0);
  if (cusparseStruct->use_extended_coo) {
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d));
    PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d));
    PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d));
    PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d));
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(0);
}

/* Basic (local-only) COO value insertion: v[] is assumed ordered as it was passed to
   the basic preallocation. If a diag/offdiag partition permutation (coo_p) exists,
   permute v into coo_pw on the device first, then forward the diagonal block to a->A
   and the offdiagonal block to a->B. v may live on host or device (detected at runtime);
   v == NULL is forwarded as-is (SeqAIJCUSPARSE treats it as "use existing values"). */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a   = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n    = cusp->coo_nd + cusp->coo_no;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* stage host values in a temporary device array */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
      d_v = w->data();
    }

    /* gather v through the partition permutation into the work array coo_pw */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    PetscCall(PetscLogGpuTimeBegin());
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    PetscCall(PetscLogGpuTimeEnd());
    delete w;
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode));
  } else {
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode));
    PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode));
  }
  PetscFunctionReturn(0);
}

/* Predicate on zip tuples (i,j,perm): true when column j lies inside [cstart,cend),
   i.e. the entry belongs to the diagonal block. */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

/* Predicate on a column index: true when it falls outside [cstart,cend). */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

/* Shift a global index down by the local range start (global -> local). */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

/* Basic COO preallocation: all (coo_i,coo_j) entries are locally owned. Partitions the
   entries into diagonal/offdiagonal blocks on the device (recording the permutation in
   cusp->coo_p when a reorder is needed), compacts the offdiagonal column space into
   b->garray, and preallocates the sequential CUSPARSE blocks b->A and b->B. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscInt               N,*jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n);       /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;

  PetscFunctionBegin;
  PetscCall(MatDestroy(&b->A));
  PetscCall(MatDestroy(&b->B));

  PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  PetscCall(PetscLogGpuTimeBegin());
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* entries are interleaved: stable-partition diag first, recording the permutation */
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  PetscCall(PetscLogGpuTimeEnd());

  /* copy offdiag column indices to map on the CPU */
  PetscCall(PetscMalloc1(cusp->coo_no,&jj)); /* jj[] will store compacted col ids of the offdiag part */
  PetscCallCUDA(cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  auto o_j = d_j.begin();
  PetscCall(PetscLogGpuTimeBegin());
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  PetscCall(PetscLogGpuTimeEnd());
  noff = thrust::distance(o_j,wit);
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  PetscCall(PetscMalloc1(noff,&b->garray));
  PetscCallCUDA(cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost));
  PetscCall(PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt)));
  PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g));
  PetscCall(ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH));
  PetscCall(ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj));
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  PetscCall(ISLocalToGlobalMappingDestroy(&l2g));

  PetscCall(MatCreate(PETSC_COMM_SELF,&b->A));
  PetscCall(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n));
  PetscCall(MatSetType(b->A,MATSEQAIJCUSPARSE));
  PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A));
  PetscCall(MatCreate(PETSC_COMM_SELF,&b->B));
  PetscCall(MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff));
  PetscCall(MatSetType(b->B,MATSEQAIJCUSPARSE));
  PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B));

  /* GPU memory, cusparse specific call handles it internally */
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get()));
  PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj));
  PetscCall(PetscFree(jj));

  PetscCall(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat));

  PetscCall(MatBindToCPU(b->A,B->boundtocpu));
  PetscCall(MatBindToCPU(b->B,B->boundtocpu));
  PetscCall(MatSetUpMultiply_MPIAIJ(B));
  PetscFunctionReturn(0);
}

/* COO preallocation entry point. Uses the fast "basic" path when every rank's entries
   are nonnegative and locally row-owned (agreed upon collectively); otherwise falls
   back to the general MPIAIJ COO machinery and mirrors its host-built maps/permutations
   onto the device for MatSetValuesCOO_MPIAIJCUSPARSE. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool          coo_basic = PETSC_TRUE;
  PetscMemType       mtype = PETSC_MEMTYPE_DEVICE;
  PetscInt           rstart,rend;

  PetscFunctionBegin;
  /* reset any previous assembly/scatter state before re-preallocating */
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(MatResetPreallocationCOO_MPIAIJ(mat));
  PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
  if (coo_i) {
    PetscCall(PetscLayoutGetRange(mat->rmap,&rstart,&rend));
    PetscCall(PetscGetMemType(coo_i,&mtype));
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)mat)));
  if (coo_basic) {
    PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
  } else {
    PetscCall(MatSetPreallocationCOO_MPIAIJ(mat,coo_n,coo_i,coo_j));
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
    PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
    mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    /* mirror the host-side COO maps/permutations built by MatSetPreallocationCOO_MPIAIJ */
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount)));

    PetscCallCUDA(cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar)));
    PetscCallCUDA(cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar)));

    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
    PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice));

    PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice));
  }
  PetscFunctionReturn(0);
}

/* Gather kernel: buf[i] = kv[perm[i]] for i in [0,nnz). Grid-stride loop; launch
   config is free, no shared memory. */
__global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
{
  PetscCount       i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) buf[i] = kv[perm[i]];
}

/* Accumulate locally-contributed COO values into the diagonal (Aa) and offdiagonal (Ba)
   CSR value arrays. Index i < Annz addresses A, otherwise B at offset i-Annz. Each output
   entry sums the kv[] slots listed by its jmap/perm range; with INSERT_VALUES the prior
   matrix value is discarded. Fix over the original: the grid-stride induction variable is
   no longer mutated in the B branch (the original's `i -= Annz` corrupted subsequent
   strides whenever the grid did not cover the whole range in one pass). */
__global__ void MatAddLocalCOOValues(const PetscScalar kv[],InsertMode imode,
                                     PetscCount Annz,const PetscCount Ajmap1[],const PetscCount Aperm1[],PetscScalar Aa[],
                                     PetscCount Bnnz,const PetscCount Bjmap1[],const PetscCount Bperm1[],PetscScalar Ba[])
{
  PetscCount       i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<Annz+Bnnz; i+= grid_size) {
    PetscScalar sum = 0.0;
    if (i<Annz) {
      for (PetscCount k=Ajmap1[i]; k<Ajmap1[i+1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES? 0.0 : Aa[i]) + sum;
    } else {
      const PetscCount j = i - Annz; /* local index, do not disturb the stride variable */
      for (PetscCount k=Bjmap1[j]; k<Bjmap1[j+1]; k++) sum += kv[Bperm1[k]];
      Ba[j] = (imode == INSERT_VALUES? 0.0 : Ba[j]) + sum;
    }
  }
}

/* Accumulate remotely-received COO values (already reduced into kv[] by the SF) into A/B.
   imap2 maps the compacted remote-entry index onto the CSR slot; contributions are always
   added (the insert-vs-add choice was applied by the local kernel). Same grid-stride fix
   as MatAddLocalCOOValues: use a local index instead of mutating i. */
__global__ void MatAddRemoteCOOValues(const PetscScalar kv[],
                                      PetscCount Annz2,const PetscCount Aimap2[],const PetscCount Ajmap2[],const PetscCount Aperm2[],PetscScalar Aa[],
                                      PetscCount Bnnz2,const PetscCount Bimap2[],const PetscCount Bjmap2[],const PetscCount Bperm2[],PetscScalar Ba[])
{
  PetscCount       i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<Annz2+Bnnz2; i+= grid_size) {
    if (i<Annz2) {
      for (PetscCount k=Ajmap2[i]; k<Ajmap2[i+1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      const PetscCount j = i - Annz2; /* local index, do not disturb the stride variable */
      for (PetscCount k=Bjmap2[j]; k<Bjmap2[j+1]; k++) Ba[Bimap2[j]] += kv[Bperm2[k]];
    }
  }
}

static PetscErrorCode
MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
{
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
  Mat                A = mpiaij->A,B = mpiaij->B;
  PetscCount         Annz = mpiaij->Annz,Annz2 = mpiaij->Annz2,Bnnz = mpiaij->Bnnz,Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa,*Ba = NULL;
  PetscScalar        *vsend = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1 = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType       memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size)); /* NOTE(review): size is unused; kept for fidelity with upstream */
    PetscCall(PetscGetMemType(v,&memtype));
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      PetscCallCUDA(cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); /* write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa)); /* read & write matrix values */
      PetscCall(MatSeqAIJCUSPARSEGetArray(B,&Ba));
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE));
    /* Add local entries to A and B */
    if (Annz+Bnnz > 0) {
      MatAddLocalCOOValues<<<(Annz+Bnnz+255)/256,256>>>(v1,imode,Annz,Ajmap1,Aperm1,Aa,Bnnz,Bjmap1,Bperm1,Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }
    PetscCall(PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE));

    /* Add received remote entries to A and B */
    if (Annz2+Bnnz2 > 0) {
      MatAddRemoteCOOValues<<<(Annz2+Bnnz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);
      PetscCallCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba));
    } else {
      PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa));
      PetscCall(MatSeqAIJCUSPARSERestoreArray(B,&Ba));
    }
    if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1)); /* free the staging copy */
  } else {
    PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode));
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Merge the diagonal and offdiagonal CUSPARSE blocks into one sequential matrix A_loc.
   When glob is requested, also build the global column index set: owned columns first
   (cst..cst+dn-1), then the compacted offdiagonal columns from cmap. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad,NULL,&dn));
    PetscCall(MatGetLocalSize(Ao,NULL,&on));
    PetscCall(MatGetOwnershipRangeColumn(A,&cst,NULL));
    PetscCall(PetscMalloc1(dn+on,&gidx));
    for (i = 0; i<dn; i++) gidx[i]    = cst + i;
    for (i = 0; i<on; i++) gidx[i+dn] = cmap[i];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob));
  }
  PetscFunctionReturn(0);
}

/* Classic (d_nnz/o_nnz) preallocation for MPIAIJCUSPARSE: validates the nnz arrays in
   debug builds, rebuilds the sequential CUSPARSE blocks b->A/b->B, and forwards the
   per-block preallocation and storage-format options. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscInt           i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheck(d_nnz[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheck(o_nnz[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscTableDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF,&b->A));
    PetscCall(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n));
    PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size));
    PetscCall(MatCreate(PETSC_COMM_SELF,&b->B));
    PetscCall(MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0));
    PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B));
  }
  PetscCall(MatSetType(b->A,MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B,MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A,B->boundtocpu));
  PetscCall(MatBindToCPU(b->B,B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* yy = A*xx: diagonal-block multiply overlapped with the scatter of the ghost values,
   then the offdiagonal contribution is added. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A,xx,yy));
  PetscCall(VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B,a->lvec,yy,yy));
  PetscFunctionReturn(0);
}

/* Zero both sequential blocks of the parallel matrix. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(0);
}

/* zz = yy + A*xx, with the same communication/computation overlap as MatMult. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A,xx,yy,zz));
  PetscCall(VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B,a->lvec,zz,zz));
  PetscFunctionReturn(0);
}
/* y = A^T*x: start the off-diagonal transpose multiply into the ghost vector, do the
   diagonal transpose multiply, then reverse-scatter-add the ghost contributions into y */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B,xx,a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A,xx,yy));
  PetscCall(VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE));
  PetscFunctionReturn(0);
}

/*
   MatCUSPARSESetFormat_MPIAIJCUSPARSE - record the requested cuSPARSE storage format for the
   diagonal block, the off-diagonal block, or both.

   Note: the error message previously listed MAT_CUSPARSE_MULT_DIAG twice and the nonexistent
   MAT_CUSPARSE_MULT_ALL; it now names the operations actually handled by the switch.
*/
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/* process -mat_cusparse_*_storage_format options; only meaningful for unfactored matrices */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ               *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject,"MPIAIJCUSPARSE options");
  if (A->factortype==MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                               "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(0);
}

/* after the usual MPIAIJ assembly, make the ghost vector CUDA and, if the nonzero pattern
   changed, free the cached device-side split-CSR mirror (it is rebuilt on demand) */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz    = A->nonzerostate;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A,mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec,VECSEQCUDA));
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;

    PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
    /* pull the struct back to the host so the device arrays it owns can be freed */
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaFree(h_mat->colmap));
    if (h_mat->allocated_indices) {
      PetscCallCUDA(cudaFree(h_mat->diag.i));
      PetscCallCUDA(cudaFree(h_mat->diag.j));
      if (h_mat->offdiag.j) {
        PetscCallCUDA(cudaFree(h_mat->offdiag.i));
        PetscCallCUDA(cudaFree(h_mat->offdiag.j));
      }
59099551766SMark Adams } 5919566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(d_mat)); 5929566063dSJacob Faibussowitsch PetscCall(PetscFree(h_mat)); 593042217e8SBarry Smith cusp->deviceMat = NULL; 5943fa6b06aSMark Adams } 59534d6c7a5SJose E. Roman PetscFunctionReturn(0); 59634d6c7a5SJose E. Roman } 59734d6c7a5SJose E. Roman 598bbf3fe20SPaul Mullowney PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 599bbf3fe20SPaul Mullowney { 6003fa6b06aSMark Adams Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 6013fa6b06aSMark Adams Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr; 602bbf3fe20SPaul Mullowney 603bbf3fe20SPaul Mullowney PetscFunctionBegin; 60428b400f6SJacob Faibussowitsch PetscCheck(cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 6053fa6b06aSMark Adams if (cusparseStruct->deviceMat) { 606042217e8SBarry Smith PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat; 607042217e8SBarry Smith 6089566063dSJacob Faibussowitsch PetscCall(PetscInfo(A,"Have device matrix\n")); 6099566063dSJacob Faibussowitsch PetscCall(PetscNew(&h_mat)); 6109566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost)); 6119566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->colmap)); 61299551766SMark Adams if (h_mat->allocated_indices) { 6139566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->diag.i)); 6149566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->diag.j)); 61599551766SMark Adams if (h_mat->offdiag.j) { 6169566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->offdiag.i)); 6179566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->offdiag.j)); 61899551766SMark Adams } 61999551766SMark Adams } 6209566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(d_mat)); 6219566063dSJacob Faibussowitsch PetscCall(PetscFree(h_mat)); 6223fa6b06aSMark Adams } 623cbc6b225SStefano Zampini /* Free COO */ 6249566063dSJacob Faibussowitsch 
PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A)); 625acbf8a88SJunchao Zhang PetscCallCXX(delete cusparseStruct); 6269566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL)); 6279566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL)); 6289566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 6299566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 6309566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL)); 6319566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL)); 6329566063dSJacob Faibussowitsch PetscCall(MatDestroy_MPIAIJ(A)); 633bbf3fe20SPaul Mullowney PetscFunctionReturn(0); 634bbf3fe20SPaul Mullowney } 635ca45077fSPaul Mullowney 6363338378cSStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat) 6379ae82921SPaul Mullowney { 638bbf3fe20SPaul Mullowney Mat_MPIAIJ *a; 6393338378cSStefano Zampini Mat A; 6409ae82921SPaul Mullowney 6419ae82921SPaul Mullowney PetscFunctionBegin; 6429566063dSJacob Faibussowitsch PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 6439566063dSJacob Faibussowitsch if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B,MAT_COPY_VALUES,newmat)); 6449566063dSJacob Faibussowitsch else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B,*newmat,SAME_NONZERO_PATTERN)); 6453338378cSStefano Zampini A = *newmat; 6466f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 6479566063dSJacob Faibussowitsch PetscCall(PetscFree(A->defaultvectype)); 6489566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype)); 64934136279SStefano Zampini 650bbf3fe20SPaul Mullowney a = 
(Mat_MPIAIJ*)A->data; 6519566063dSJacob Faibussowitsch if (a->A) PetscCall(MatSetType(a->A,MATSEQAIJCUSPARSE)); 6529566063dSJacob Faibussowitsch if (a->B) PetscCall(MatSetType(a->B,MATSEQAIJCUSPARSE)); 6539566063dSJacob Faibussowitsch if (a->lvec) PetscCall(VecSetType(a->lvec,VECSEQCUDA)); 654d98d7c49SStefano Zampini 6553338378cSStefano Zampini if (reuse != MAT_REUSE_MATRIX && !a->spptr) { 656acbf8a88SJunchao Zhang PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE); 6573338378cSStefano Zampini } 6582205254eSKarl Rupp 65934d6c7a5SJose E. Roman A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 660bbf3fe20SPaul Mullowney A->ops->mult = MatMult_MPIAIJCUSPARSE; 661fdc842d1SBarry Smith A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 662bbf3fe20SPaul Mullowney A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 663bbf3fe20SPaul Mullowney A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 664bbf3fe20SPaul Mullowney A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 6653fa6b06aSMark Adams A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE; 6664e84afc0SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND; 6672205254eSKarl Rupp 6689566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE)); 6699566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE)); 6709566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE)); 6719566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE)); 6729566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE)); 6739566063dSJacob Faibussowitsch 
PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE)); 674ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE) 6759566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE)); 676ae48a8d0SStefano Zampini #endif 6779ae82921SPaul Mullowney PetscFunctionReturn(0); 6789ae82921SPaul Mullowney } 6799ae82921SPaul Mullowney 6803338378cSStefano Zampini PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 6813338378cSStefano Zampini { 6823338378cSStefano Zampini PetscFunctionBegin; 6839566063dSJacob Faibussowitsch PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 6849566063dSJacob Faibussowitsch PetscCall(MatCreate_MPIAIJ(A)); 6859566063dSJacob Faibussowitsch PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A)); 6863338378cSStefano Zampini PetscFunctionReturn(0); 6873338378cSStefano Zampini } 6883338378cSStefano Zampini 6893f9c0db1SPaul Mullowney /*@ 6903f9c0db1SPaul Mullowney MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 691e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 6923f9c0db1SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 693e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 694e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 695e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 6969ae82921SPaul Mullowney 697d083f849SBarry Smith Collective 698e057df02SPaul Mullowney 699e057df02SPaul Mullowney Input Parameters: 700e057df02SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 701e057df02SPaul Mullowney . m - number of rows 702e057df02SPaul Mullowney . 
n - number of columns 703e057df02SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 704e057df02SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 7050298fd71SBarry Smith (possibly different for each row) or NULL 706e057df02SPaul Mullowney 707e057df02SPaul Mullowney Output Parameter: 708e057df02SPaul Mullowney . A - the matrix 709e057df02SPaul Mullowney 710e057df02SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 711e057df02SPaul Mullowney MatXXXXSetPreallocation() paradigm instead of this routine directly. 712e057df02SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 713e057df02SPaul Mullowney 714e057df02SPaul Mullowney Notes: 715e057df02SPaul Mullowney If nnz is given then nz is ignored 716e057df02SPaul Mullowney 717e057df02SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 718e057df02SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 719e057df02SPaul Mullowney storage. That is, the stored row and column indices can begin at 720e057df02SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 721e057df02SPaul Mullowney 722e057df02SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 7230298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 724e057df02SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 725e057df02SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 726e057df02SPaul Mullowney 727e057df02SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 728e057df02SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. 
We 729e057df02SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 730e057df02SPaul Mullowney reusing matrix information to achieve increased efficiency. 731e057df02SPaul Mullowney 732e057df02SPaul Mullowney Level: intermediate 733e057df02SPaul Mullowney 734*db781477SPatrick Sanan .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATMPIAIJCUSPARSE`, `MATAIJCUSPARSE` 735e057df02SPaul Mullowney @*/ 7369ae82921SPaul Mullowney PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 7379ae82921SPaul Mullowney { 7389ae82921SPaul Mullowney PetscMPIInt size; 7399ae82921SPaul Mullowney 7409ae82921SPaul Mullowney PetscFunctionBegin; 7419566063dSJacob Faibussowitsch PetscCall(MatCreate(comm,A)); 7429566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A,m,n,M,N)); 7439566063dSJacob Faibussowitsch PetscCallMPI(MPI_Comm_size(comm,&size)); 7449ae82921SPaul Mullowney if (size > 1) { 7459566063dSJacob Faibussowitsch PetscCall(MatSetType(*A,MATMPIAIJCUSPARSE)); 7469566063dSJacob Faibussowitsch PetscCall(MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz)); 7479ae82921SPaul Mullowney } else { 7489566063dSJacob Faibussowitsch PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE)); 7499566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(*A,d_nz,d_nnz)); 7509ae82921SPaul Mullowney } 7519ae82921SPaul Mullowney PetscFunctionReturn(0); 7529ae82921SPaul Mullowney } 7539ae82921SPaul Mullowney 7543ca39a21SBarry Smith /*MC 7556bb69460SJunchao Zhang MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATMPIAIJCUSPARSE. 756e057df02SPaul Mullowney 7572692e278SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 7582692e278SPaul Mullowney CSR, ELL, or Hybrid format. 
The ELL and HYB formats require CUDA 4.2 or later. 7592692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 7609ae82921SPaul Mullowney 7619ae82921SPaul Mullowney This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 7629ae82921SPaul Mullowney and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 7639ae82921SPaul Mullowney MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 7649ae82921SPaul Mullowney for communicators controlling multiple processes. It is recommended that you call both of 7659ae82921SPaul Mullowney the above preallocation routines for simplicity. 7669ae82921SPaul Mullowney 7679ae82921SPaul Mullowney Options Database Keys: 768e057df02SPaul Mullowney + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 7698468deeeSKarl Rupp . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 7708468deeeSKarl Rupp . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 7718468deeeSKarl Rupp - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
7729ae82921SPaul Mullowney 7739ae82921SPaul Mullowney Level: beginner 7749ae82921SPaul Mullowney 775*db781477SPatrick Sanan .seealso: `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation` 7766bb69460SJunchao Zhang M*/ 7776bb69460SJunchao Zhang 7786bb69460SJunchao Zhang /*MC 7796bb69460SJunchao Zhang MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATAIJCUSPARSE. 7806bb69460SJunchao Zhang 7816bb69460SJunchao Zhang Level: beginner 7826bb69460SJunchao Zhang 783*db781477SPatrick Sanan .seealso: `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE` 7849ae82921SPaul Mullowney M*/ 7853fa6b06aSMark Adams 786042217e8SBarry Smith // get GPU pointers to stripped down Mat. For both seq and MPI Mat. 787042217e8SBarry Smith PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B) 7883fa6b06aSMark Adams { 789042217e8SBarry Smith PetscSplitCSRDataStructure d_mat; 790042217e8SBarry Smith PetscMPIInt size; 791042217e8SBarry Smith int *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL; 792042217e8SBarry Smith PetscScalar *aa = NULL,*ba = NULL; 79399551766SMark Adams Mat_SeqAIJ *jaca = NULL, *jacb = NULL; 794042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusparsestructA = NULL; 795042217e8SBarry Smith CsrMatrix *matrixA = NULL,*matrixB = NULL; 7963fa6b06aSMark Adams 7973fa6b06aSMark Adams PetscFunctionBegin; 79828b400f6SJacob Faibussowitsch PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix"); 799042217e8SBarry Smith if (A->factortype != MAT_FACTOR_NONE) { 8003fa6b06aSMark Adams *B = NULL; 8013fa6b06aSMark Adams PetscFunctionReturn(0); 8023fa6b06aSMark Adams } 8039566063dSJacob Faibussowitsch PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 80499551766SMark Adams // get jaca 8053fa6b06aSMark Adams if (size == 1) { 806042217e8SBarry Smith PetscBool isseqaij; 
    PetscCall(PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij));
    if (isseqaij) {
      /* plain sequential AIJ: the matrix itself is the "diagonal" block */
      jaca = (Mat_SeqAIJ*)A->data;
      PetscCheck(jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      d_mat = cusparsestructA->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(A));
    } else {
      /* one-process MPIAIJ: use its (seq) diagonal block aij->A */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
    }
    if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      /* no off-diagonal block in the sequential case */
      bi = NULL;
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    /* parallel case: need both the diagonal (A) and off-diagonal (B) blocks */
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    PetscCheck(A->nooffprocentries || aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    PetscCheck(cusparsestructA->format==MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A));
      PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B));
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      PetscCheck(matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheck(matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        /* compressed rows: build/reuse a full (uncompressed) device row-offset array for B */
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    /* same full row-offset array treatment for the diagonal block */
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    PetscSplitCSRDataStructure h_mat;

    // create and populate struct on host and copy on device
    PetscCall(PetscInfo(A,"Create device matrix\n"));
    PetscCall(PetscNew(&h_mat));
    PetscCallCUDA(cudaMalloc((void**)&d_mat,sizeof(*d_mat)));
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscInt   *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      PetscCheck(!n || aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      /* colmap maps global column -> 1-based local column of B (0 means "not present") */
      PetscCall(PetscCalloc1(N+1,&colmap));
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // have to make a long version of these
        /* cuSPARSE stores 32-bit indices; stage them to host, widen to PetscInt, push back */
        int      *h_bi32, *h_bj32;
        PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        PetscCall(PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64));
        PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost));
        for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
        PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost));
        for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];

        PetscCallCUDA(cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64)));
        PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice));
        PetscCallCUDA(cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64)));
        PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice));

        h_mat->offdiag.i = d_bi64;
        h_mat->offdiag.j = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE; /* widened copies are owned; freed on destroy */

        PetscCall(PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64));
      }
#else
      /* 32-bit PetscInt: the cuSPARSE arrays can be aliased directly */
      h_mat->offdiag.i = (PetscInt*)bi;
      h_mat->offdiag.j = (PetscInt*)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      PetscCallCUDA(cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap)));
      PetscCallCUDA(cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice));
      PetscCall(PetscFree(colmap));
    }
    /* ownership ranges so device code can translate global <-> local indices */
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N;
#if defined(PETSC_USE_64BIT_INDICES)
    {
      /* same 32->64 bit widening for the diagonal block indices */
      int      *h_ai32, *h_aj32;
      PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;

      static_assert(sizeof(PetscInt) == 8,"");
      PetscCall(PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64));
      PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost));
      for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
      PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost));
      for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];

      PetscCallCUDA(cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64)));
      PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice));
      PetscCallCUDA(cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64)));
      PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice));

      h_mat->diag.i = d_ai64;
      h_mat->diag.j = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      PetscCall(PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64));
    }
#else
    h_mat->diag.i = (PetscInt*)ai;
    h_mat->diag.j = (PetscInt*)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    PetscCallCUDA(cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice));
    PetscCall(PetscFree(h_mat));
  } else {
    PetscCall(PetscInfo(A,"Reusing device matrix\n"));
  }
  *B = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}