#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>

PETSC_INTERN PetscErrorCode MatSetPreallocationCOO_SeqAIJCUSPARSE(Mat,PetscInt,const PetscInt[],const PetscInt[]);
PETSC_INTERN PetscErrorCode MatSetValuesCOO_SeqAIJCUSPARSE(Mat,const PetscScalar[],InsertMode);

struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  if (cusp->coo_p) {
    ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
    THRUSTARRAY w(v,v+n);
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(w.begin(),cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(w.begin(),cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_BOTH;
  PetscFunctionReturn(0);
}

template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n);
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  thrust::advance(o_j,cusp->coo_nd);
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  if (b->lvec) {
    ierr = VecSetType(b->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
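  /* Overlap communication with computation: begin scattering the off-process entries of xx into
     a->lvec, apply the local diagonal block while the scatter is in flight, then complete the
     scatter and add the off-diagonal block contribution into yy. */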
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ  *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ  *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt    n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t err;
      PetscScalar *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
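  /* Transpose multiply: apply B^T and A^T to the locally owned part of xx, then scatter-add
     (SCATTER_REVERSE with ADD_VALUES) the off-diagonal contribution back to its owning processes. */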
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (d_mat) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }

  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;

  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

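  /* Install the CUSPARSE-aware implementations defined above over the host MPIAIJ operations */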
  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations.  For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz).  By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have it calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT, o_nz=PETSC_DEFAULT and d_nnz=NULL, o_nnz=NULL for PETSc to
   control dynamic memory allocation.  For large problems you MUST preallocate memory
   or you will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

// get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
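// On first use this builds a PetscSplitCSRDataStructure that holds raw device pointers to the
// diagonal (and, in parallel, off-diagonal) CSR arrays plus the global-to-local column map, copies
// it to the GPU, and returns its device address so values can be set directly from CUDA kernels.
// The matrix is marked unassembled on return, mirroring what MatSetValues() does on the host.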
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;
  PetscScalar                *aa,*ba;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;
      } else {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
      }
    } else {
      Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
        if (rank==-1) {
          for (unsigned int i = 0; i < matrixB->row_offsets->size(); i++)
            std::cout << "\trow_offsets[" << i << "] = " << (*matrixB->row_offsets)[i] << std::endl;
          for (unsigned int i = 0; i < matrixB->column_indices->size(); i++)
            std::cout << "\tcolumn_indices[" << i << "] = " << (*matrixB->column_indices)[i] << std::endl;
          for (unsigned int i = 0; i < matrixB->values->size(); i++)
            std::cout << "\tvalues[" << i << "] = " << (*matrixB->values)[i] << std::endl;
        }
      } else {
        SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
      }
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because not called on host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assembled matrix");
  }
  if (!*p_d_mat) {
    cudaError_t                err;
    PetscSplitCSRDataStructure *d_mat, h_mat;
    Mat_SeqAIJ                 *jaca;
    PetscInt                   n = A->rmap->n, nnz;
    // create and copy
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err = cudaMemset( d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazy) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9;
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
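      /* colmap maps a global column index to (local off-diagonal column + 1); the -9 sentinel at
         index A->cmap->N is checked below to catch an out-of-range entry in garray. */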
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      if (jacb->compressedrow.use) {
        err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy( h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else {
        h_mat.offdiag.i = bi;
      }
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy( h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err = cudaMemcpy( d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat;
  }
  A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}