1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 299acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 30fd2b57fSKarl Rupp 43d13b8fdSMatthew G. Knepley #include <petscconf.h> 59ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 657d48284SJunchao Zhang #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 73d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 87e8381f9SStefano Zampini #include <thrust/advance.h> 9a2cee5feSJed Brown #include <thrust/partition.h> 10a2cee5feSJed Brown #include <thrust/sort.h> 11a2cee5feSJed Brown #include <thrust/unique.h> 124eb5378fSStefano Zampini #include <petscsf.h> 137e8381f9SStefano Zampini 147e8381f9SStefano Zampini struct VecCUDAEquals 157e8381f9SStefano Zampini { 167e8381f9SStefano Zampini template <typename Tuple> 177e8381f9SStefano Zampini __host__ __device__ 187e8381f9SStefano Zampini void operator()(Tuple t) 197e8381f9SStefano Zampini { 207e8381f9SStefano Zampini thrust::get<1>(t) = thrust::get<0>(t); 217e8381f9SStefano Zampini } 227e8381f9SStefano Zampini }; 237e8381f9SStefano Zampini 24cbc6b225SStefano Zampini static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat) 25cbc6b225SStefano Zampini { 26cbc6b225SStefano Zampini Mat_MPIAIJ *aij = (Mat_MPIAIJ*)mat->data; 27cbc6b225SStefano Zampini Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr; 28cbc6b225SStefano Zampini 29cbc6b225SStefano Zampini PetscFunctionBegin; 30cbc6b225SStefano Zampini if (!cusparseStruct) PetscFunctionReturn(0); 31cbc6b225SStefano Zampini if (cusparseStruct->use_extended_coo) { 329566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusparseStruct->Aimap1_d)); 339566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusparseStruct->Ajmap1_d)); 349566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusparseStruct->Aperm1_d)); 359566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusparseStruct->Bimap1_d)); 369566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusparseStruct->Bjmap1_d)); 379566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusparseStruct->Bperm1_d)); 389566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusparseStruct->Aimap2_d)); 399566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusparseStruct->Ajmap2_d)); 409566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusparseStruct->Aperm2_d)); 419566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusparseStruct->Bimap2_d)); 429566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusparseStruct->Bjmap2_d)); 439566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusparseStruct->Bperm2_d)); 449566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusparseStruct->Cperm1_d)); 459566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusparseStruct->sendbuf_d)); 469566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(cusparseStruct->recvbuf_d)); 47cbc6b225SStefano Zampini } 48cbc6b225SStefano Zampini cusparseStruct->use_extended_coo = PETSC_FALSE; 49cbc6b225SStefano Zampini delete cusparseStruct->coo_p; 50cbc6b225SStefano Zampini delete cusparseStruct->coo_pw; 51cbc6b225SStefano Zampini cusparseStruct->coo_p = NULL; 52cbc6b225SStefano Zampini cusparseStruct->coo_pw = NULL; 53cbc6b225SStefano Zampini PetscFunctionReturn(0); 54cbc6b225SStefano Zampini } 55cbc6b225SStefano Zampini 56219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) 577e8381f9SStefano Zampini { 587e8381f9SStefano Zampini Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 597e8381f9SStefano Zampini Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr; 607e8381f9SStefano Zampini PetscInt n = cusp->coo_nd + cusp->coo_no; 617e8381f9SStefano Zampini 627e8381f9SStefano Zampini PetscFunctionBegin; 63e61fc153SStefano Zampini if (cusp->coo_p && v) { 6408391a17SStefano Zampini thrust::device_ptr<const PetscScalar> d_v; 6508391a17SStefano Zampini THRUSTARRAY *w = NULL; 6608391a17SStefano Zampini 6708391a17SStefano Zampini if (isCudaMem(v)) { 6808391a17SStefano Zampini d_v = thrust::device_pointer_cast(v); 6908391a17SStefano Zampini } else { 70e61fc153SStefano Zampini w = new THRUSTARRAY(n); 71e61fc153SStefano Zampini w->assign(v,v+n); 729566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(n*sizeof(PetscScalar))); 7308391a17SStefano Zampini d_v = w->data(); 7408391a17SStefano Zampini } 7508391a17SStefano Zampini 7608391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()), 777e8381f9SStefano Zampini cusp->coo_pw->begin())); 7808391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()), 797e8381f9SStefano Zampini cusp->coo_pw->end())); 809566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 817e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 829566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 83e61fc153SStefano Zampini delete w; 849566063dSJacob Faibussowitsch PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode)); 859566063dSJacob Faibussowitsch PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode)); 867e8381f9SStefano Zampini } else { 879566063dSJacob Faibussowitsch PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode)); 889566063dSJacob Faibussowitsch PetscCall(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode)); 897e8381f9SStefano Zampini } 907e8381f9SStefano Zampini PetscFunctionReturn(0); 917e8381f9SStefano Zampini } 927e8381f9SStefano Zampini 937e8381f9SStefano Zampini template <typename Tuple> 947e8381f9SStefano Zampini struct IsNotOffDiagT 957e8381f9SStefano Zampini { 967e8381f9SStefano Zampini PetscInt _cstart,_cend; 977e8381f9SStefano Zampini 987e8381f9SStefano Zampini IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {} 997e8381f9SStefano Zampini __host__ __device__ 1007e8381f9SStefano Zampini inline bool operator()(Tuple t) 1017e8381f9SStefano Zampini { 1027e8381f9SStefano Zampini return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); 1037e8381f9SStefano Zampini } 1047e8381f9SStefano Zampini }; 1057e8381f9SStefano Zampini 1067e8381f9SStefano Zampini struct IsOffDiag 1077e8381f9SStefano Zampini { 1087e8381f9SStefano Zampini PetscInt _cstart,_cend; 1097e8381f9SStefano Zampini 1107e8381f9SStefano Zampini IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {} 1117e8381f9SStefano Zampini __host__ __device__ 1127e8381f9SStefano Zampini inline bool operator() (const PetscInt &c) 1137e8381f9SStefano Zampini { 1147e8381f9SStefano Zampini return c < _cstart || c >= _cend; 1157e8381f9SStefano Zampini } 1167e8381f9SStefano Zampini }; 1177e8381f9SStefano Zampini 1187e8381f9SStefano Zampini struct GlobToLoc 1197e8381f9SStefano Zampini { 1207e8381f9SStefano Zampini PetscInt _start; 1217e8381f9SStefano Zampini 1227e8381f9SStefano Zampini GlobToLoc(PetscInt start) : _start(start) {} 1237e8381f9SStefano Zampini __host__ __device__ 1247e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &c) 1257e8381f9SStefano Zampini { 1267e8381f9SStefano Zampini return c - _start; 1277e8381f9SStefano Zampini } 1287e8381f9SStefano Zampini }; 1297e8381f9SStefano Zampini 130219fbbafSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[]) 1317e8381f9SStefano Zampini { 1327e8381f9SStefano Zampini Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 1337e8381f9SStefano Zampini Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr; 13482a78a4eSJed Brown PetscInt N,*jj; 1357e8381f9SStefano Zampini size_t noff = 0; 136ddea5d60SJunchao Zhang THRUSTINTARRAY d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */ 1377e8381f9SStefano Zampini THRUSTINTARRAY d_j(n); 1387e8381f9SStefano Zampini ISLocalToGlobalMapping l2g; 1397e8381f9SStefano Zampini 1407e8381f9SStefano Zampini PetscFunctionBegin; 1419566063dSJacob Faibussowitsch PetscCall(MatDestroy(&b->A)); 1429566063dSJacob Faibussowitsch PetscCall(MatDestroy(&b->B)); 1437e8381f9SStefano Zampini 1449566063dSJacob Faibussowitsch PetscCall(PetscLogCpuToGpu(2.*n*sizeof(PetscInt))); 1457e8381f9SStefano Zampini d_i.assign(coo_i,coo_i+n); 1467e8381f9SStefano Zampini d_j.assign(coo_j,coo_j+n); 1479566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1487e8381f9SStefano Zampini auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend)); 1497e8381f9SStefano Zampini auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend)); 1507e8381f9SStefano Zampini if (firstoffd != d_j.end() && firstdiag != d_j.end()) { 1517e8381f9SStefano Zampini cusp->coo_p = new THRUSTINTARRAY(n); 1527e8381f9SStefano Zampini cusp->coo_pw = new THRUSTARRAY(n); 1537e8381f9SStefano Zampini thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0); 1547e8381f9SStefano Zampini auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin())); 1557e8381f9SStefano Zampini auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end())); 1567e8381f9SStefano Zampini auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend)); 1577e8381f9SStefano Zampini firstoffd = mzipp.get_iterator_tuple().get<1>(); 1587e8381f9SStefano Zampini } 1597e8381f9SStefano Zampini cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); 1607e8381f9SStefano Zampini cusp->coo_no = thrust::distance(firstoffd,d_j.end()); 1617e8381f9SStefano Zampini 1627e8381f9SStefano Zampini /* from global to local */ 1637e8381f9SStefano Zampini thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart)); 1647e8381f9SStefano Zampini thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart)); 1659566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 1667e8381f9SStefano Zampini 1677e8381f9SStefano Zampini /* copy offdiag column indices to map on the CPU */ 1689566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(cusp->coo_no,&jj)); /* jj[] will store compacted col ids of the offdiag part */ 1699566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 1707e8381f9SStefano Zampini auto o_j = d_j.begin(); 1719566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 172ddea5d60SJunchao Zhang thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */ 1737e8381f9SStefano Zampini thrust::sort(thrust::device,o_j,d_j.end()); 174ddea5d60SJunchao Zhang auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */ 1759566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 1767e8381f9SStefano Zampini noff = thrust::distance(o_j,wit); 177cbc6b225SStefano Zampini /* allocate the garray, the columns of B will not be mapped in the multiply setup */ 1789566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(noff,&b->garray)); 1799566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 1809566063dSJacob Faibussowitsch PetscCall(PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt))); 1819566063dSJacob Faibussowitsch PetscCall(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g)); 1829566063dSJacob Faibussowitsch PetscCall(ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH)); 1839566063dSJacob Faibussowitsch PetscCall(ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj)); 1842c71b3e2SJacob Faibussowitsch PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no); 1859566063dSJacob Faibussowitsch PetscCall(ISLocalToGlobalMappingDestroy(&l2g)); 1867e8381f9SStefano Zampini 1879566063dSJacob Faibussowitsch PetscCall(MatCreate(PETSC_COMM_SELF,&b->A)); 1889566063dSJacob Faibussowitsch PetscCall(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n)); 1899566063dSJacob Faibussowitsch PetscCall(MatSetType(b->A,MATSEQAIJCUSPARSE)); 1909566063dSJacob Faibussowitsch PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A)); 1919566063dSJacob Faibussowitsch PetscCall(MatCreate(PETSC_COMM_SELF,&b->B)); 1929566063dSJacob Faibussowitsch PetscCall(MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff)); 1939566063dSJacob Faibussowitsch PetscCall(MatSetType(b->B,MATSEQAIJCUSPARSE)); 1949566063dSJacob Faibussowitsch PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B)); 1957e8381f9SStefano Zampini 1967e8381f9SStefano Zampini /* GPU memory, cusparse specific call handles it internally */ 1979566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get())); 1989566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj)); 1999566063dSJacob Faibussowitsch PetscCall(PetscFree(jj)); 2007e8381f9SStefano Zampini 2019566063dSJacob Faibussowitsch PetscCall(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat)); 2029566063dSJacob Faibussowitsch PetscCall(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat)); 2037e8381f9SStefano Zampini 2049566063dSJacob Faibussowitsch PetscCall(MatBindToCPU(b->A,B->boundtocpu)); 2059566063dSJacob Faibussowitsch PetscCall(MatBindToCPU(b->B,B->boundtocpu)); 2069566063dSJacob Faibussowitsch PetscCall(MatSetUpMultiply_MPIAIJ(B)); 2077e8381f9SStefano Zampini PetscFunctionReturn(0); 2087e8381f9SStefano Zampini } 2099ae82921SPaul Mullowney 210219fbbafSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 211219fbbafSJunchao Zhang { 212cbc6b225SStefano Zampini Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 213219fbbafSJunchao Zhang Mat_MPIAIJCUSPARSE *mpidev; 214cbc6b225SStefano Zampini PetscBool coo_basic = PETSC_TRUE; 215219fbbafSJunchao Zhang PetscMemType mtype = PETSC_MEMTYPE_DEVICE; 216219fbbafSJunchao Zhang PetscInt rstart,rend; 217219fbbafSJunchao Zhang 218219fbbafSJunchao Zhang PetscFunctionBegin; 2199566063dSJacob Faibussowitsch PetscCall(PetscFree(mpiaij->garray)); 2209566063dSJacob Faibussowitsch PetscCall(VecDestroy(&mpiaij->lvec)); 221cbc6b225SStefano Zampini #if defined(PETSC_USE_CTABLE) 2229566063dSJacob Faibussowitsch PetscCall(PetscTableDestroy(&mpiaij->colmap)); 223cbc6b225SStefano Zampini #else 2249566063dSJacob Faibussowitsch PetscCall(PetscFree(mpiaij->colmap)); 225cbc6b225SStefano Zampini #endif 2269566063dSJacob Faibussowitsch PetscCall(VecScatterDestroy(&mpiaij->Mvctx)); 227cbc6b225SStefano Zampini mat->assembled = PETSC_FALSE; 228cbc6b225SStefano Zampini mat->was_assembled = PETSC_FALSE; 2299566063dSJacob Faibussowitsch PetscCall(MatResetPreallocationCOO_MPIAIJ(mat)); 2309566063dSJacob Faibussowitsch PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat)); 231219fbbafSJunchao Zhang if (coo_i) { 2329566063dSJacob Faibussowitsch PetscCall(PetscLayoutGetRange(mat->rmap,&rstart,&rend)); 2339566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(coo_i,&mtype)); 234219fbbafSJunchao Zhang if (PetscMemTypeHost(mtype)) { 235219fbbafSJunchao Zhang for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */ 236cbc6b225SStefano Zampini if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = PETSC_FALSE; break;} 237219fbbafSJunchao Zhang } 238219fbbafSJunchao Zhang } 239219fbbafSJunchao Zhang } 240219fbbafSJunchao Zhang /* All ranks must agree on the value of coo_basic */ 2419566063dSJacob Faibussowitsch PetscCallMPI(MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)mat))); 242219fbbafSJunchao Zhang if (coo_basic) { 2439566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j)); 244219fbbafSJunchao Zhang } else { 2459566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_MPIAIJ(mat,coo_n,coo_i,coo_j)); 246cbc6b225SStefano Zampini mat->offloadmask = PETSC_OFFLOAD_CPU; 247cbc6b225SStefano Zampini /* creates the GPU memory */ 2489566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A)); 2499566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B)); 250cbc6b225SStefano Zampini mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr); 251219fbbafSJunchao Zhang mpidev->use_extended_coo = PETSC_TRUE; 252219fbbafSJunchao Zhang 2539566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount))); 2549566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount))); 2559566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount))); 256219fbbafSJunchao Zhang 2579566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount))); 2589566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount))); 2599566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount))); 260219fbbafSJunchao Zhang 2619566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount))); 2629566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount))); 2639566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount))); 264219fbbafSJunchao Zhang 2659566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount))); 2669566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount))); 2679566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount))); 268219fbbafSJunchao Zhang 2699566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount))); 2709566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar))); 2719566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar))); 272219fbbafSJunchao Zhang 2739566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice)); 2749566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice)); 2759566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice)); 276219fbbafSJunchao Zhang 2779566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice)); 2789566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice)); 2799566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice)); 280219fbbafSJunchao Zhang 2819566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice)); 2829566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice)); 2839566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice)); 284219fbbafSJunchao Zhang 2859566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice)); 2869566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice)); 2879566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice)); 288219fbbafSJunchao Zhang 2899566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice)); 290219fbbafSJunchao Zhang } 291219fbbafSJunchao Zhang PetscFunctionReturn(0); 292219fbbafSJunchao Zhang } 293219fbbafSJunchao Zhang 294219fbbafSJunchao Zhang __global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[]) 295219fbbafSJunchao Zhang { 296219fbbafSJunchao Zhang PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; 297219fbbafSJunchao Zhang const PetscCount grid_size = gridDim.x * blockDim.x; 298219fbbafSJunchao Zhang for (; i<nnz; i+= grid_size) buf[i] = kv[perm[i]]; 299219fbbafSJunchao Zhang } 300219fbbafSJunchao Zhang 301219fbbafSJunchao Zhang __global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[]) 302219fbbafSJunchao Zhang { 303219fbbafSJunchao Zhang PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; 304219fbbafSJunchao Zhang const PetscCount grid_size = gridDim.x * blockDim.x; 305219fbbafSJunchao Zhang for (; i<nnz; i += grid_size) {for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) a[imap[i]] += kv[perm[k]];} 306219fbbafSJunchao Zhang } 307219fbbafSJunchao Zhang 308219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode) 309219fbbafSJunchao Zhang { 310219fbbafSJunchao Zhang Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); 311219fbbafSJunchao Zhang Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr); 312219fbbafSJunchao Zhang Mat A = mpiaij->A,B = mpiaij->B; 313219fbbafSJunchao Zhang PetscCount Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2; 314cbc6b225SStefano Zampini PetscScalar *Aa,*Ba = NULL; 315219fbbafSJunchao Zhang PetscScalar *vsend = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d; 316219fbbafSJunchao Zhang const PetscScalar *v1 = v; 317219fbbafSJunchao Zhang const PetscCount *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d; 318219fbbafSJunchao Zhang const PetscCount *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d; 319219fbbafSJunchao Zhang const PetscCount *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d; 320219fbbafSJunchao Zhang const PetscCount *Cperm1 = mpidev->Cperm1_d; 321219fbbafSJunchao Zhang PetscMemType memtype; 322219fbbafSJunchao Zhang 323219fbbafSJunchao Zhang PetscFunctionBegin; 324219fbbafSJunchao Zhang if (mpidev->use_extended_coo) { 325cbc6b225SStefano Zampini PetscMPIInt size; 326cbc6b225SStefano Zampini 3279566063dSJacob Faibussowitsch PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size)); 3289566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(v,&memtype)); 329219fbbafSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */ 3309566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar))); 3319566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice)); 332219fbbafSJunchao Zhang } 333219fbbafSJunchao Zhang 334219fbbafSJunchao Zhang if (imode == INSERT_VALUES) { 3359566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); /* write matrix values */ 3369566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemset(Aa,0,((Mat_SeqAIJ*)A->data)->nz*sizeof(PetscScalar))); 337cbc6b225SStefano Zampini if (size > 1) { 3389566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba)); 3399566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemset(Ba,0,((Mat_SeqAIJ*)B->data)->nz*sizeof(PetscScalar))); 340cbc6b225SStefano Zampini } 341219fbbafSJunchao Zhang } else { 3429566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEGetArray(A,&Aa)); /* read & write matrix values */ 3439566063dSJacob Faibussowitsch if (size > 1) PetscCall(MatSeqAIJCUSPARSEGetArray(B,&Ba)); 344219fbbafSJunchao Zhang } 345219fbbafSJunchao Zhang 346219fbbafSJunchao Zhang /* Pack entries to be sent to remote */ 347cbc6b225SStefano Zampini if (mpiaij->sendlen) { 348219fbbafSJunchao Zhang MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend); 3499566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); 350cbc6b225SStefano Zampini } 351219fbbafSJunchao Zhang 352219fbbafSJunchao Zhang /* Send remote entries to their owner and overlap the communication with local computation */ 3539566063dSJacob Faibussowitsch if (size > 1) PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE)); 354219fbbafSJunchao Zhang /* Add local entries to A and B */ 355cbc6b225SStefano Zampini if (Annz1) { 356219fbbafSJunchao Zhang MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa); 3579566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); 358cbc6b225SStefano Zampini } 359cbc6b225SStefano Zampini if (Bnnz1) { 360219fbbafSJunchao Zhang MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba); 3619566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); 362cbc6b225SStefano Zampini } 3639566063dSJacob Faibussowitsch if (size > 1) PetscCall(PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE)); 364219fbbafSJunchao Zhang 365219fbbafSJunchao Zhang /* Add received remote entries to A and B */ 366cbc6b225SStefano Zampini if (Annz2) { 367219fbbafSJunchao Zhang MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa); 3689566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); 369cbc6b225SStefano Zampini } 370cbc6b225SStefano Zampini if (Bnnz2) { 371219fbbafSJunchao Zhang MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba); 3729566063dSJacob Faibussowitsch PetscCallCUDA(cudaPeekAtLastError()); 373cbc6b225SStefano Zampini } 374219fbbafSJunchao Zhang 375219fbbafSJunchao Zhang if (imode == INSERT_VALUES) { 3769566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa)); 3779566063dSJacob Faibussowitsch if (size > 1) PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba)); 378219fbbafSJunchao Zhang } else { 3799566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSERestoreArray(A,&Aa)); 3809566063dSJacob Faibussowitsch if (size > 1) PetscCall(MatSeqAIJCUSPARSERestoreArray(B,&Ba)); 381219fbbafSJunchao Zhang } 3829566063dSJacob Faibussowitsch if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void*)v1)); 383219fbbafSJunchao Zhang } else { 3849566063dSJacob Faibussowitsch PetscCall(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode)); 385219fbbafSJunchao Zhang } 386cbc6b225SStefano Zampini mat->offloadmask = PETSC_OFFLOAD_GPU; 387219fbbafSJunchao Zhang PetscFunctionReturn(0); 388219fbbafSJunchao Zhang } 389219fbbafSJunchao Zhang 390ed502f03SStefano Zampini static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc) 391ed502f03SStefano Zampini { 392ed502f03SStefano Zampini Mat Ad,Ao; 393ed502f03SStefano Zampini const PetscInt *cmap; 394ed502f03SStefano Zampini 395ed502f03SStefano Zampini PetscFunctionBegin; 3969566063dSJacob Faibussowitsch PetscCall(MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap)); 3979566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc)); 398ed502f03SStefano Zampini if (glob) { 399ed502f03SStefano Zampini PetscInt cst, i, dn, on, *gidx; 400ed502f03SStefano Zampini 4019566063dSJacob Faibussowitsch PetscCall(MatGetLocalSize(Ad,NULL,&dn)); 4029566063dSJacob Faibussowitsch PetscCall(MatGetLocalSize(Ao,NULL,&on)); 4039566063dSJacob Faibussowitsch PetscCall(MatGetOwnershipRangeColumn(A,&cst,NULL)); 4049566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(dn+on,&gidx)); 405ed502f03SStefano Zampini for (i = 0; i<dn; i++) gidx[i] = cst + i; 406ed502f03SStefano Zampini for (i = 0; i<on; i++) gidx[i+dn] = cmap[i]; 4079566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob)); 408ed502f03SStefano Zampini } 409ed502f03SStefano Zampini PetscFunctionReturn(0); 410ed502f03SStefano Zampini } 411ed502f03SStefano Zampini 4129ae82921SPaul Mullowney PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 4139ae82921SPaul Mullowney { 414bbf3fe20SPaul Mullowney Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 415bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 4169ae82921SPaul Mullowney PetscInt i; 4179ae82921SPaul Mullowney 4189ae82921SPaul Mullowney PetscFunctionBegin; 4199566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(B->rmap)); 4209566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(B->cmap)); 4217e8381f9SStefano Zampini if (PetscDefined(USE_DEBUG) && d_nnz) { 4229ae82921SPaul Mullowney for (i=0; i<B->rmap->n; i++) { 423*08401ef6SPierre Jolivet PetscCheck(d_nnz[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]); 4249ae82921SPaul Mullowney } 4259ae82921SPaul Mullowney } 4267e8381f9SStefano Zampini if (PetscDefined(USE_DEBUG) && o_nnz) { 4279ae82921SPaul Mullowney for (i=0; i<B->rmap->n; i++) { 428*08401ef6SPierre Jolivet PetscCheck(o_nnz[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]); 4299ae82921SPaul Mullowney } 4309ae82921SPaul Mullowney } 4316a29ce69SStefano Zampini #if defined(PETSC_USE_CTABLE) 4329566063dSJacob Faibussowitsch PetscCall(PetscTableDestroy(&b->colmap)); 4336a29ce69SStefano Zampini #else 4349566063dSJacob Faibussowitsch PetscCall(PetscFree(b->colmap)); 4356a29ce69SStefano Zampini #endif 4369566063dSJacob Faibussowitsch PetscCall(PetscFree(b->garray)); 4379566063dSJacob Faibussowitsch PetscCall(VecDestroy(&b->lvec)); 4389566063dSJacob Faibussowitsch PetscCall(VecScatterDestroy(&b->Mvctx)); 4396a29ce69SStefano Zampini /* Because the B will have been resized we simply destroy it and create a new one each time */ 4409566063dSJacob Faibussowitsch PetscCall(MatDestroy(&b->B)); 4416a29ce69SStefano Zampini if (!b->A) { 4429566063dSJacob Faibussowitsch PetscCall(MatCreate(PETSC_COMM_SELF,&b->A)); 4439566063dSJacob Faibussowitsch PetscCall(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n)); 4449566063dSJacob Faibussowitsch PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A)); 4456a29ce69SStefano Zampini } 4466a29ce69SStefano Zampini if (!b->B) { 4476a29ce69SStefano Zampini PetscMPIInt size; 4489566063dSJacob Faibussowitsch PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size)); 4499566063dSJacob Faibussowitsch PetscCall(MatCreate(PETSC_COMM_SELF,&b->B)); 4509566063dSJacob Faibussowitsch PetscCall(MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0)); 4519566063dSJacob Faibussowitsch PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B)); 4529ae82921SPaul Mullowney } 4539566063dSJacob Faibussowitsch PetscCall(MatSetType(b->A,MATSEQAIJCUSPARSE)); 4549566063dSJacob Faibussowitsch PetscCall(MatSetType(b->B,MATSEQAIJCUSPARSE)); 4559566063dSJacob Faibussowitsch PetscCall(MatBindToCPU(b->A,B->boundtocpu)); 4569566063dSJacob Faibussowitsch PetscCall(MatBindToCPU(b->B,B->boundtocpu)); 4579566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz)); 4589566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz)); 4599566063dSJacob Faibussowitsch PetscCall(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat)); 4609566063dSJacob Faibussowitsch PetscCall(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat)); 4619ae82921SPaul Mullowney B->preallocated = PETSC_TRUE; 4629ae82921SPaul Mullowney PetscFunctionReturn(0); 4639ae82921SPaul Mullowney } 464e057df02SPaul Mullowney 4659ae82921SPaul Mullowney PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 4669ae82921SPaul Mullowney { 4679ae82921SPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 4689ae82921SPaul Mullowney 4699ae82921SPaul Mullowney PetscFunctionBegin; 4709566063dSJacob Faibussowitsch PetscCall(VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD)); 4719566063dSJacob Faibussowitsch PetscCall((*a->A->ops->mult)(a->A,xx,yy)); 4729566063dSJacob Faibussowitsch PetscCall(VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD)); 4739566063dSJacob Faibussowitsch PetscCall((*a->B->ops->multadd)(a->B,a->lvec,yy,yy)); 4749ae82921SPaul Mullowney PetscFunctionReturn(0); 4759ae82921SPaul Mullowney } 476ca45077fSPaul Mullowney 4773fa6b06aSMark Adams PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A) 4783fa6b06aSMark Adams { 4793fa6b06aSMark Adams Mat_MPIAIJ *l = (Mat_MPIAIJ*)A->data; 4803fa6b06aSMark Adams 4813fa6b06aSMark Adams PetscFunctionBegin; 4829566063dSJacob Faibussowitsch PetscCall(MatZeroEntries(l->A)); 4839566063dSJacob Faibussowitsch PetscCall(MatZeroEntries(l->B)); 4843fa6b06aSMark Adams PetscFunctionReturn(0); 4853fa6b06aSMark Adams } 486042217e8SBarry Smith 487fdc842d1SBarry Smith PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 488fdc842d1SBarry Smith { 489fdc842d1SBarry Smith Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 490fdc842d1SBarry Smith 491fdc842d1SBarry Smith PetscFunctionBegin; 4929566063dSJacob Faibussowitsch PetscCall(VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD)); 4939566063dSJacob Faibussowitsch PetscCall((*a->A->ops->multadd)(a->A,xx,yy,zz)); 4949566063dSJacob Faibussowitsch PetscCall(VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD)); 4959566063dSJacob Faibussowitsch PetscCall((*a->B->ops->multadd)(a->B,a->lvec,zz,zz)); 496fdc842d1SBarry Smith PetscFunctionReturn(0); 497fdc842d1SBarry Smith } 498fdc842d1SBarry Smith 499ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 500ca45077fSPaul Mullowney { 501ca45077fSPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 502ca45077fSPaul Mullowney 503ca45077fSPaul Mullowney PetscFunctionBegin; 5049566063dSJacob Faibussowitsch PetscCall((*a->B->ops->multtranspose)(a->B,xx,a->lvec)); 5059566063dSJacob Faibussowitsch PetscCall((*a->A->ops->multtranspose)(a->A,xx,yy)); 5069566063dSJacob Faibussowitsch PetscCall(VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE)); 5079566063dSJacob Faibussowitsch PetscCall(VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE)); 508ca45077fSPaul Mullowney PetscFunctionReturn(0); 509ca45077fSPaul Mullowney } 5109ae82921SPaul Mullowney 511e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 512ca45077fSPaul Mullowney { 513ca45077fSPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 514bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 515e057df02SPaul Mullowney 516ca45077fSPaul Mullowney PetscFunctionBegin; 517e057df02SPaul Mullowney switch (op) { 518e057df02SPaul Mullowney case MAT_CUSPARSE_MULT_DIAG: 519e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = format; 520045c96e1SPaul Mullowney break; 521e057df02SPaul Mullowney case MAT_CUSPARSE_MULT_OFFDIAG: 522e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = format; 523045c96e1SPaul Mullowney break; 524e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 525e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = format; 526e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = format; 527045c96e1SPaul Mullowney break; 528e057df02SPaul Mullowney default: 52998921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 530045c96e1SPaul Mullowney } 531ca45077fSPaul Mullowney PetscFunctionReturn(0); 532ca45077fSPaul Mullowney } 533e057df02SPaul Mullowney 5344416b707SBarry Smith PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 535e057df02SPaul Mullowney { 536e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 537e057df02SPaul Mullowney PetscBool flg; 538a183c035SDominic Meiser Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 539a183c035SDominic Meiser Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 5405fd66863SKarl Rupp 541e057df02SPaul Mullowney PetscFunctionBegin; 5429566063dSJacob Faibussowitsch PetscCall(PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options")); 543e057df02SPaul Mullowney if (A->factortype==MAT_FACTOR_NONE) { 5449566063dSJacob Faibussowitsch PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 5455f80ce2aSJacob Faibussowitsch "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg)); 5469566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format)); 5479566063dSJacob Faibussowitsch PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 5485f80ce2aSJacob Faibussowitsch "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg)); 5499566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format)); 5509566063dSJacob Faibussowitsch PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 5515f80ce2aSJacob Faibussowitsch "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg)); 5529566063dSJacob Faibussowitsch if (flg) PetscCall(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format)); 553e057df02SPaul Mullowney } 5549566063dSJacob Faibussowitsch PetscCall(PetscOptionsTail()); 555e057df02SPaul Mullowney PetscFunctionReturn(0); 556e057df02SPaul Mullowney } 557e057df02SPaul Mullowney 55834d6c7a5SJose E. Roman PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 55934d6c7a5SJose E. Roman { 5603fa6b06aSMark Adams Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 561042217e8SBarry Smith Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr; 562042217e8SBarry Smith PetscObjectState onnz = A->nonzerostate; 563042217e8SBarry Smith 56434d6c7a5SJose E. Roman PetscFunctionBegin; 5659566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd_MPIAIJ(A,mode)); 5669566063dSJacob Faibussowitsch if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec,VECSEQCUDA)); 567042217e8SBarry Smith if (onnz != A->nonzerostate && cusp->deviceMat) { 568042217e8SBarry Smith PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat; 569042217e8SBarry Smith 5709566063dSJacob Faibussowitsch PetscCall(PetscInfo(A,"Destroy device mat since nonzerostate changed\n")); 5719566063dSJacob Faibussowitsch PetscCall(PetscNew(&h_mat)); 5729566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost)); 5739566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->colmap)); 57499551766SMark Adams if (h_mat->allocated_indices) { 5759566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->diag.i)); 5769566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->diag.j)); 57799551766SMark Adams if (h_mat->offdiag.j) { 5789566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->offdiag.i)); 5799566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->offdiag.j)); 58099551766SMark Adams } 58199551766SMark Adams } 5829566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(d_mat)); 5839566063dSJacob Faibussowitsch PetscCall(PetscFree(h_mat)); 584042217e8SBarry Smith cusp->deviceMat = NULL; 5853fa6b06aSMark Adams } 58634d6c7a5SJose E. Roman PetscFunctionReturn(0); 58734d6c7a5SJose E. Roman } 58834d6c7a5SJose E. Roman 589bbf3fe20SPaul Mullowney PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 590bbf3fe20SPaul Mullowney { 5913fa6b06aSMark Adams Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 5923fa6b06aSMark Adams Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr; 593bbf3fe20SPaul Mullowney 594bbf3fe20SPaul Mullowney PetscFunctionBegin; 59528b400f6SJacob Faibussowitsch PetscCheck(cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 5963fa6b06aSMark Adams if (cusparseStruct->deviceMat) { 597042217e8SBarry Smith PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat; 598042217e8SBarry Smith 5999566063dSJacob Faibussowitsch PetscCall(PetscInfo(A,"Have device matrix\n")); 6009566063dSJacob Faibussowitsch PetscCall(PetscNew(&h_mat)); 6019566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost)); 6029566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->colmap)); 60399551766SMark Adams if (h_mat->allocated_indices) { 6049566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->diag.i)); 6059566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->diag.j)); 60699551766SMark Adams if (h_mat->offdiag.j) { 6079566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->offdiag.i)); 6089566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(h_mat->offdiag.j)); 60999551766SMark Adams } 61099551766SMark Adams } 6119566063dSJacob Faibussowitsch PetscCallCUDA(cudaFree(d_mat)); 6129566063dSJacob Faibussowitsch PetscCall(PetscFree(h_mat)); 6133fa6b06aSMark Adams } 614cbc6b225SStefano Zampini /* Free COO */ 6159566063dSJacob Faibussowitsch PetscCall(MatResetPreallocationCOO_MPIAIJCUSPARSE(A)); 616acbf8a88SJunchao Zhang PetscCallCXX(delete cusparseStruct); 6179566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL)); 6189566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL)); 6199566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 6209566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 6219566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL)); 6229566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL)); 6239566063dSJacob Faibussowitsch PetscCall(MatDestroy_MPIAIJ(A)); 624bbf3fe20SPaul Mullowney PetscFunctionReturn(0); 625bbf3fe20SPaul Mullowney } 626ca45077fSPaul Mullowney 6273338378cSStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat) 6289ae82921SPaul Mullowney { 629bbf3fe20SPaul Mullowney Mat_MPIAIJ *a; 6303338378cSStefano Zampini Mat A; 6319ae82921SPaul Mullowney 6329ae82921SPaul Mullowney PetscFunctionBegin; 6339566063dSJacob Faibussowitsch PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 6349566063dSJacob Faibussowitsch if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B,MAT_COPY_VALUES,newmat)); 6359566063dSJacob Faibussowitsch else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B,*newmat,SAME_NONZERO_PATTERN)); 6363338378cSStefano Zampini A = *newmat; 6376f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 6389566063dSJacob Faibussowitsch PetscCall(PetscFree(A->defaultvectype)); 6399566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECCUDA,&A->defaultvectype)); 64034136279SStefano Zampini 641bbf3fe20SPaul Mullowney a = (Mat_MPIAIJ*)A->data; 6429566063dSJacob Faibussowitsch if (a->A) PetscCall(MatSetType(a->A,MATSEQAIJCUSPARSE)); 6439566063dSJacob Faibussowitsch if (a->B) PetscCall(MatSetType(a->B,MATSEQAIJCUSPARSE)); 6449566063dSJacob Faibussowitsch if (a->lvec) PetscCall(VecSetType(a->lvec,VECSEQCUDA)); 645d98d7c49SStefano Zampini 6463338378cSStefano Zampini if (reuse != MAT_REUSE_MATRIX && !a->spptr) { 647acbf8a88SJunchao Zhang PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE); 6483338378cSStefano Zampini } 6492205254eSKarl Rupp 65034d6c7a5SJose E. Roman A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 651bbf3fe20SPaul Mullowney A->ops->mult = MatMult_MPIAIJCUSPARSE; 652fdc842d1SBarry Smith A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 653bbf3fe20SPaul Mullowney A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 654bbf3fe20SPaul Mullowney A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 655bbf3fe20SPaul Mullowney A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 6563fa6b06aSMark Adams A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE; 6574e84afc0SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND; 6582205254eSKarl Rupp 6599566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE)); 6609566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE)); 6619566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE)); 6629566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE)); 6639566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE)); 6649566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE)); 665ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE) 6669566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE)); 667ae48a8d0SStefano Zampini #endif 6689ae82921SPaul Mullowney PetscFunctionReturn(0); 6699ae82921SPaul Mullowney } 6709ae82921SPaul Mullowney 6713338378cSStefano Zampini PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 6723338378cSStefano Zampini { 6733338378cSStefano Zampini PetscFunctionBegin; 6749566063dSJacob Faibussowitsch PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 6759566063dSJacob Faibussowitsch PetscCall(MatCreate_MPIAIJ(A)); 6769566063dSJacob Faibussowitsch PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A)); 6773338378cSStefano Zampini PetscFunctionReturn(0); 6783338378cSStefano Zampini } 6793338378cSStefano Zampini 6803f9c0db1SPaul Mullowney /*@ 6813f9c0db1SPaul Mullowney MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 682e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 6833f9c0db1SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 684e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 685e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 686e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 6879ae82921SPaul Mullowney 688d083f849SBarry Smith Collective 689e057df02SPaul Mullowney 690e057df02SPaul Mullowney Input Parameters: 691e057df02SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 692e057df02SPaul Mullowney . m - number of rows 693e057df02SPaul Mullowney . n - number of columns 694e057df02SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 695e057df02SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 6960298fd71SBarry Smith (possibly different for each row) or NULL 697e057df02SPaul Mullowney 698e057df02SPaul Mullowney Output Parameter: 699e057df02SPaul Mullowney . A - the matrix 700e057df02SPaul Mullowney 701e057df02SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 702e057df02SPaul Mullowney MatXXXXSetPreallocation() paradigm instead of this routine directly. 703e057df02SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 704e057df02SPaul Mullowney 705e057df02SPaul Mullowney Notes: 706e057df02SPaul Mullowney If nnz is given then nz is ignored 707e057df02SPaul Mullowney 708e057df02SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 709e057df02SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 710e057df02SPaul Mullowney storage. That is, the stored row and column indices can begin at 711e057df02SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 712e057df02SPaul Mullowney 713e057df02SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 7140298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 715e057df02SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 716e057df02SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 717e057df02SPaul Mullowney 718e057df02SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 719e057df02SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 720e057df02SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 721e057df02SPaul Mullowney reusing matrix information to achieve increased efficiency. 722e057df02SPaul Mullowney 723e057df02SPaul Mullowney Level: intermediate 724e057df02SPaul Mullowney 725e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 726e057df02SPaul Mullowney @*/ 7279ae82921SPaul Mullowney PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 7289ae82921SPaul Mullowney { 7299ae82921SPaul Mullowney PetscMPIInt size; 7309ae82921SPaul Mullowney 7319ae82921SPaul Mullowney PetscFunctionBegin; 7329566063dSJacob Faibussowitsch PetscCall(MatCreate(comm,A)); 7339566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A,m,n,M,N)); 7349566063dSJacob Faibussowitsch PetscCallMPI(MPI_Comm_size(comm,&size)); 7359ae82921SPaul Mullowney if (size > 1) { 7369566063dSJacob Faibussowitsch PetscCall(MatSetType(*A,MATMPIAIJCUSPARSE)); 7379566063dSJacob Faibussowitsch PetscCall(MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz)); 7389ae82921SPaul Mullowney } else { 7399566063dSJacob Faibussowitsch PetscCall(MatSetType(*A,MATSEQAIJCUSPARSE)); 7409566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(*A,d_nz,d_nnz)); 7419ae82921SPaul Mullowney } 7429ae82921SPaul Mullowney PetscFunctionReturn(0); 7439ae82921SPaul Mullowney } 7449ae82921SPaul Mullowney 7453ca39a21SBarry Smith /*MC 7466bb69460SJunchao Zhang MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATMPIAIJCUSPARSE. 747e057df02SPaul Mullowney 7482692e278SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 7492692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 7502692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 7519ae82921SPaul Mullowney 7529ae82921SPaul Mullowney This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 7539ae82921SPaul Mullowney and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 7549ae82921SPaul Mullowney MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 7559ae82921SPaul Mullowney for communicators controlling multiple processes. It is recommended that you call both of 7569ae82921SPaul Mullowney the above preallocation routines for simplicity. 7579ae82921SPaul Mullowney 7589ae82921SPaul Mullowney Options Database Keys: 759e057df02SPaul Mullowney + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 7608468deeeSKarl Rupp . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 7618468deeeSKarl Rupp . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 7628468deeeSKarl Rupp - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 7639ae82921SPaul Mullowney 7649ae82921SPaul Mullowney Level: beginner 7659ae82921SPaul Mullowney 7666bb69460SJunchao Zhang .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 7676bb69460SJunchao Zhang M*/ 7686bb69460SJunchao Zhang 7696bb69460SJunchao Zhang /*MC 7706bb69460SJunchao Zhang MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATAIJCUSPARSE. 7716bb69460SJunchao Zhang 7726bb69460SJunchao Zhang Level: beginner 7736bb69460SJunchao Zhang 7746bb69460SJunchao Zhang .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE 7759ae82921SPaul Mullowney M*/ 7763fa6b06aSMark Adams 777042217e8SBarry Smith // get GPU pointers to stripped down Mat. For both seq and MPI Mat. 778042217e8SBarry Smith PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B) 7793fa6b06aSMark Adams { 780042217e8SBarry Smith PetscSplitCSRDataStructure d_mat; 781042217e8SBarry Smith PetscMPIInt size; 782042217e8SBarry Smith int *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL; 783042217e8SBarry Smith PetscScalar *aa = NULL,*ba = NULL; 78499551766SMark Adams Mat_SeqAIJ *jaca = NULL, *jacb = NULL; 785042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusparsestructA = NULL; 786042217e8SBarry Smith CsrMatrix *matrixA = NULL,*matrixB = NULL; 7873fa6b06aSMark Adams 7883fa6b06aSMark Adams PetscFunctionBegin; 78928b400f6SJacob Faibussowitsch PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix"); 790042217e8SBarry Smith if (A->factortype != MAT_FACTOR_NONE) { 7913fa6b06aSMark Adams *B = NULL; 7923fa6b06aSMark Adams PetscFunctionReturn(0); 7933fa6b06aSMark Adams } 7949566063dSJacob Faibussowitsch PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 79599551766SMark Adams // get jaca 7963fa6b06aSMark Adams if (size == 1) { 797042217e8SBarry Smith PetscBool isseqaij; 798042217e8SBarry Smith 7999566063dSJacob Faibussowitsch PetscCall(PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij)); 800042217e8SBarry Smith if (isseqaij) { 8013fa6b06aSMark Adams jaca = (Mat_SeqAIJ*)A->data; 80228b400f6SJacob Faibussowitsch PetscCheck(jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 803042217e8SBarry Smith cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr; 804042217e8SBarry Smith d_mat = cusparsestructA->deviceMat; 8059566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(A)); 8063fa6b06aSMark Adams } else { 8073fa6b06aSMark Adams Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 80828b400f6SJacob Faibussowitsch PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 809042217e8SBarry Smith Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 8103fa6b06aSMark Adams jaca = (Mat_SeqAIJ*)aij->A->data; 811042217e8SBarry Smith cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr; 812042217e8SBarry Smith d_mat = spptr->deviceMat; 8139566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A)); 8143fa6b06aSMark Adams } 815042217e8SBarry Smith if (cusparsestructA->format==MAT_CUSPARSE_CSR) { 816042217e8SBarry Smith Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 81728b400f6SJacob Faibussowitsch PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A"); 818042217e8SBarry Smith matrixA = (CsrMatrix*)matstruct->mat; 819042217e8SBarry Smith bi = NULL; 820042217e8SBarry Smith bj = NULL; 821042217e8SBarry Smith ba = NULL; 822042217e8SBarry Smith } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR"); 823042217e8SBarry Smith } else { 824042217e8SBarry Smith Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 82528b400f6SJacob Faibussowitsch PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 826042217e8SBarry Smith jaca = (Mat_SeqAIJ*)aij->A->data; 82799551766SMark Adams jacb = (Mat_SeqAIJ*)aij->B->data; 828042217e8SBarry Smith Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 8293fa6b06aSMark Adams 830*08401ef6SPierre Jolivet PetscCheck(A->nooffprocentries || aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)"); 831042217e8SBarry Smith cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr; 832042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr; 833*08401ef6SPierre Jolivet PetscCheck(cusparsestructA->format==MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR"); 834042217e8SBarry Smith if (cusparsestructB->format==MAT_CUSPARSE_CSR) { 8359566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->A)); 8369566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCUSPARSECopyToGPU(aij->B)); 837042217e8SBarry Smith Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 838042217e8SBarry Smith Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat; 83928b400f6SJacob Faibussowitsch PetscCheck(matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A"); 84028b400f6SJacob Faibussowitsch PetscCheck(matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B"); 841042217e8SBarry Smith matrixA = (CsrMatrix*)matstructA->mat; 842042217e8SBarry Smith matrixB = (CsrMatrix*)matstructB->mat; 8433fa6b06aSMark Adams if (jacb->compressedrow.use) { 844042217e8SBarry Smith if (!cusparsestructB->rowoffsets_gpu) { 845042217e8SBarry Smith cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 846042217e8SBarry Smith cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1); 8473fa6b06aSMark Adams } 848042217e8SBarry Smith bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data()); 849042217e8SBarry Smith } else { 850042217e8SBarry Smith bi = thrust::raw_pointer_cast(matrixB->row_offsets->data()); 851042217e8SBarry Smith } 852042217e8SBarry Smith bj = thrust::raw_pointer_cast(matrixB->column_indices->data()); 853042217e8SBarry Smith ba = thrust::raw_pointer_cast(matrixB->values->data()); 854042217e8SBarry Smith } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR"); 855042217e8SBarry Smith d_mat = spptr->deviceMat; 856042217e8SBarry Smith } 8573fa6b06aSMark Adams if (jaca->compressedrow.use) { 858042217e8SBarry Smith if (!cusparsestructA->rowoffsets_gpu) { 859042217e8SBarry Smith cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 860042217e8SBarry Smith cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1); 8613fa6b06aSMark Adams } 862042217e8SBarry Smith ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data()); 8633fa6b06aSMark Adams } else { 864042217e8SBarry Smith ai = thrust::raw_pointer_cast(matrixA->row_offsets->data()); 8653fa6b06aSMark Adams } 866042217e8SBarry Smith aj = thrust::raw_pointer_cast(matrixA->column_indices->data()); 867042217e8SBarry Smith aa = thrust::raw_pointer_cast(matrixA->values->data()); 868042217e8SBarry Smith 869042217e8SBarry Smith if (!d_mat) { 870042217e8SBarry Smith PetscSplitCSRDataStructure h_mat; 871042217e8SBarry Smith 872042217e8SBarry Smith // create and populate strucy on host and copy on device 8739566063dSJacob Faibussowitsch PetscCall(PetscInfo(A,"Create device matrix\n")); 8749566063dSJacob Faibussowitsch PetscCall(PetscNew(&h_mat)); 8759566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&d_mat,sizeof(*d_mat))); 876042217e8SBarry Smith if (size > 1) { /* need the colmap array */ 877042217e8SBarry Smith Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 87899551766SMark Adams PetscInt *colmap; 879042217e8SBarry Smith PetscInt ii,n = aij->B->cmap->n,N = A->cmap->N; 880042217e8SBarry Smith 881*08401ef6SPierre Jolivet PetscCheck(!n || aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 882042217e8SBarry Smith 8839566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(N+1,&colmap)); 884042217e8SBarry Smith for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1); 885365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES) 886365b711fSMark Adams { // have to make a long version of these 88799551766SMark Adams int *h_bi32, *h_bj32; 88899551766SMark Adams PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64; 8899566063dSJacob Faibussowitsch PetscCall(PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64)); 8909566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost)); 89199551766SMark Adams for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i]; 8929566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost)); 89399551766SMark Adams for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i]; 89499551766SMark Adams 8959566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64))); 8969566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice)); 8979566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64))); 8989566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice)); 89999551766SMark Adams 90099551766SMark Adams h_mat->offdiag.i = d_bi64; 90199551766SMark Adams h_mat->offdiag.j = d_bj64; 90299551766SMark Adams h_mat->allocated_indices = PETSC_TRUE; 90399551766SMark Adams 9049566063dSJacob Faibussowitsch PetscCall(PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64)); 905365b711fSMark Adams } 906365b711fSMark Adams #else 90799551766SMark Adams h_mat->offdiag.i = (PetscInt*)bi; 90899551766SMark Adams h_mat->offdiag.j = (PetscInt*)bj; 90999551766SMark Adams h_mat->allocated_indices = PETSC_FALSE; 910365b711fSMark Adams #endif 911042217e8SBarry Smith h_mat->offdiag.a = ba; 912042217e8SBarry Smith h_mat->offdiag.n = A->rmap->n; 913042217e8SBarry Smith 9149566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap))); 9159566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice)); 9169566063dSJacob Faibussowitsch PetscCall(PetscFree(colmap)); 917042217e8SBarry Smith } 918042217e8SBarry Smith h_mat->rstart = A->rmap->rstart; 919042217e8SBarry Smith h_mat->rend = A->rmap->rend; 920042217e8SBarry Smith h_mat->cstart = A->cmap->rstart; 921042217e8SBarry Smith h_mat->cend = A->cmap->rend; 92249b994a9SMark Adams h_mat->M = A->cmap->N; 923365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES) 924365b711fSMark Adams { 925*08401ef6SPierre Jolivet PetscCheck(sizeof(PetscInt) == 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"size pof PetscInt = %d",sizeof(PetscInt)); 92699551766SMark Adams int *h_ai32, *h_aj32; 92799551766SMark Adams PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64; 9289566063dSJacob Faibussowitsch PetscCall(PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64)); 9299566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost)); 93099551766SMark Adams for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i]; 9319566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost)); 93299551766SMark Adams for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i]; 93399551766SMark Adams 9349566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64))); 9359566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice)); 9369566063dSJacob Faibussowitsch PetscCallCUDA(cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64))); 9379566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice)); 93899551766SMark Adams 93999551766SMark Adams h_mat->diag.i = d_ai64; 94099551766SMark Adams h_mat->diag.j = d_aj64; 94199551766SMark Adams h_mat->allocated_indices = PETSC_TRUE; 94299551766SMark Adams 9439566063dSJacob Faibussowitsch PetscCall(PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64)); 944365b711fSMark Adams } 945365b711fSMark Adams #else 94699551766SMark Adams h_mat->diag.i = (PetscInt*)ai; 94799551766SMark Adams h_mat->diag.j = (PetscInt*)aj; 94899551766SMark Adams h_mat->allocated_indices = PETSC_FALSE; 949365b711fSMark Adams #endif 950042217e8SBarry Smith h_mat->diag.a = aa; 951042217e8SBarry Smith h_mat->diag.n = A->rmap->n; 952042217e8SBarry Smith h_mat->rank = PetscGlobalRank; 953042217e8SBarry Smith // copy pointers and metadata to device 9549566063dSJacob Faibussowitsch PetscCallCUDA(cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice)); 9559566063dSJacob Faibussowitsch PetscCall(PetscFree(h_mat)); 956042217e8SBarry Smith } else { 9579566063dSJacob Faibussowitsch PetscCall(PetscInfo(A,"Reusing device matrix\n")); 958042217e8SBarry Smith } 959042217e8SBarry Smith *B = d_mat; 960042217e8SBarry Smith A->offloadmask = PETSC_OFFLOAD_GPU; 9613fa6b06aSMark Adams PetscFunctionReturn(0); 9623fa6b06aSMark Adams } 963