#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <petscsf.h>

/* Functor for thrust::for_each over a zip iterator: copies element 0 of the
   tuple into element 1 (used below to gather permuted COO values). */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Insert numerical values (in the COO ordering given to MatSetPreallocationCOO)
   into the diagonal (a->A) and off-diagonal (a->B) sequential blocks.

   If the preallocation step built a permutation (cusp->coo_p), the input values
   v are first gathered through that permutation into the work array
   cusp->coo_pw on the device; v may live on the host or the device (detected
   with isCudaMem).  Otherwise v is assumed to already be split as
   [nd diagonal values | no off-diagonal values] and is forwarded directly. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n     = cusp->coo_nd + cusp->coo_no; /* total COO entries on this rank */
  PetscErrorCode     ierr;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* host values: stage them on the device first */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* gather d_v through the permutation coo_p into the work array coo_pw */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    cerr = WaitForCUDA();CHKERRCUDA(cerr);
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    /* coo_pw is laid out as [nd diagonal | no off-diagonal] values */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU; /* values were written on the device */
  PetscFunctionReturn(0);
}

/* Predicate on a (row,col,perm) tuple: true when the column index (tuple
   element 1) lies inside the local diagonal-block column range [cstart,cend). */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

/* Predicate on a column index: true when it falls outside [cstart,cend),
   i.e. it belongs to the off-diagonal block. */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

/* Maps a global index to a local one by subtracting the range start. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const
PetscInt &c)
  {
    return c - _start;
  }
};

/* Build the COO assembly structures for an MPIAIJCUSPARSE matrix from the
   (global) index arrays coo_i/coo_j of length n.

   On the device, entries are partitioned (stably, keeping a permutation in
   cusp->coo_p) into diagonal-block entries (column in [cmap->rstart,rend))
   followed by off-diagonal entries; counts land in cusp->coo_nd/coo_no.
   Indices are then translated to local numbering, the distinct off-diagonal
   columns become b->garray, and the two sequential blocks b->A/b->B are
   (re)created and COO-preallocated.  Any previous A/B/lvec/garray state is
   destroyed first.  Values come later through MatSetValuesCOO. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;       /* number of distinct off-diagonal columns */
  THRUSTINTARRAY         d_i(n);
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* discard any existing sequential blocks; they are rebuilt below */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* the input is not already [diag... | offdiag...]: partition it and record
       the permutation so MatSetValuesCOO can reorder values accordingly */
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd);
  /* sort+unique the off-diagonal columns on the device to get garray */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end());
  cerr = WaitForCUDA();CHKERRCUDA(cerr);
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr); /* +1 keeps the allocation non-NULL when noff == 0 */
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* translate the saved off-diagonal global columns to positions in garray */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr =
MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask   = PETSC_OFFLOAD_CPU;
  B->assembled     = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Return the local part of A (diagonal and off-diagonal blocks merged into a
   single SeqAIJCUSPARSE matrix A_loc).  When glob is non-NULL, also return an
   IS with the global column index of every local column: the owned columns
   [cst, cst+dn) first, followed by the ghost columns from cmap. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;   /* owned (diagonal-block) columns */
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];   /* ghost (off-diagonal) columns  */
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Classic (d_nnz/o_nnz) preallocation for an MPIAIJCUSPARSE matrix: validates
   the per-row counts in debug mode, resets the column map / scatter state,
   (re)creates the sequential diagonal (b->A) and off-diagonal (b->B) blocks as
   SEQAIJCUSPARSE, preallocates them, and propagates the cuSPARSE storage
   formats and handle from the parallel matrix. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    /* the off-diagonal block is empty in the sequential case */
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
*/
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/*@
   MatAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose

   Not collective

   Input Parameters:
+  A - Matrix of type SEQAIJCUSPARSE or MPIAIJCUSPARSE
-  gen - the boolean flag

   Level: intermediate

.seealso: MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE
@*/
PetscErrorCode MatAIJCUSPARSESetGenerateTranspose(Mat A, PetscBool gen)
{
  PetscErrorCode ierr;
  PetscBool      ismpiaij;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A,MAT_CLASSID,1);
  MatCheckPreallocated(A,1);
  ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATMPIAIJ,&ismpiaij);CHKERRQ(ierr);
  if (ismpiaij) {
    Mat A_d,A_o;

    /* forward the flag to both sequential blocks of the parallel matrix */
    ierr = MatMPIAIJGetSeqAIJ(A,&A_d,&A_o,NULL);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A_d,gen);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A_o,gen);CHKERRQ(ierr);
  } else {
    ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,gen);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* y = A*x: overlap the ghost-value scatter with the diagonal-block multiply,
   then add the off-diagonal contribution. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Zero all stored values.  If a split-CSR device matrix exists, its diagonal
   and off-diagonal value arrays are zeroed directly on the GPU first (the
   device-side array pointers are fetched with a small cudaMemcpy each). */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE         *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ  *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ  *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt    n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t err;
      PetscScalar *vals;
      /* message typo fixed: "offfdiag" -> "offdiag" */
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

/* z = y + A*x, same scatter/compute overlap pattern as MatMult. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* y = A^T*x: local transpose multiplies, then a reverse scatter accumulates
   the off-diagonal contributions into y. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Record the requested cuSPARSE storage format for the diagonal block, the
   off-diagonal block, or both.  (Error message fixed: it previously listed
   MAT_CUSPARSE_MULT_DIAG twice and a nonexistent MAT_CUSPARSE_MULT_ALL.) */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/* Process -mat_cusparse_*_storage_format command-line options and apply them
   through MatCUSPARSESetFormat. */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr =
MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 443e057df02SPaul Mullowney } 444e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 445a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 446e057df02SPaul Mullowney if (flg) { 447e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 448e057df02SPaul Mullowney } 449e057df02SPaul Mullowney } 4500af67c1bSStefano Zampini ierr = PetscOptionsTail();CHKERRQ(ierr); 451e057df02SPaul Mullowney PetscFunctionReturn(0); 452e057df02SPaul Mullowney } 453e057df02SPaul Mullowney 45434d6c7a5SJose E. Roman PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 45534d6c7a5SJose E. Roman { 45634d6c7a5SJose E. Roman PetscErrorCode ierr; 4573fa6b06aSMark Adams Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 4583fa6b06aSMark Adams Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr; 4593fa6b06aSMark Adams PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat; 46034d6c7a5SJose E. Roman PetscFunctionBegin; 46134d6c7a5SJose E. Roman ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 46234d6c7a5SJose E. Roman if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) { 46334d6c7a5SJose E. Roman ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); 46434d6c7a5SJose E. Roman } 465a587d139SMark if (d_mat) { 4663fa6b06aSMark Adams A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device 4673fa6b06aSMark Adams } 4683fa6b06aSMark Adams 46934d6c7a5SJose E. Roman PetscFunctionReturn(0); 47034d6c7a5SJose E. Roman } 47134d6c7a5SJose E. 
/*
   MatDestroy_MPIAIJCUSPARSE - frees the device-side resources hanging off the
   matrix's spptr (split-CSR device mirror, cuSPARSE handle, COO permutation
   buffers), removes the composed type-specific methods, and delegates the
   host-side teardown to MatDestroy_MPIAIJ().
*/
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    /* The device descriptor owns copies of the row pointers (only when the host
       blocks use compressed rows), the colmap, and the descriptor itself; free
       them mirroring the allocations done in MatCUSPARSEGetDeviceMatWrite() */
    Mat_SeqAIJ                 *jaca  = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb  = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err  = cudaMemcpy(&h_mat,d_mat,sizeof(PetscSplitCSRDataStructure),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    /* NOTE(review): thrust/CUDA C++ failures are normally std::exception-derived;
       catch(char*) follows the existing PETSc convention here but may not
       intercept them -- confirm against the other *cusparse destructors */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* drop the composed methods installed by MatConvert_MPIAIJ_MPIAIJCUSPARSE() */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatConvert_MPIAIJ_MPIAIJCUSPARSE - converts (or refreshes) an MPIAIJ matrix
   into an MPIAIJCUSPARSE one: switches the sequential diagonal/off-diagonal
   blocks and the local work vector to their CUDA types, allocates the cuSPARSE
   bookkeeping on first conversion, and installs the device-aware operations
   table plus the composed type-specific methods.

   mtype is unused here (the target type is fixed) but is required by the
   MatConvert() callback interface.
*/
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;                 /* for MAT_INPLACE_MATRIX, A aliases B */
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr       = new Mat_MPIAIJCUSPARSE;
    cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0; /* We should not need cusparseStruct->stream */
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    stat = cusparseSetStream(cusparseStruct->handle,PetscDefaultCudaStream);CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Type constructor for MATMPIAIJCUSPARSE: build a plain MPIAIJ matrix and convert it in place */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz).  By setting these parameters
   accurately, performance during matrix assembly can be increased by more than a
   factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of local rows (or PETSC_DECIDE to have it computed from M)
.  n - number of local columns (or PETSC_DECIDE to have it computed from N)
.  M - number of global rows (or PETSC_DETERMINE to have it computed from m)
.  N - number of global columns (or PETSC_DETERMINE to have it computed from n)
.  d_nz - number of nonzeros per row in the DIAGONAL portion of the local submatrix (same value is used for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the DIAGONAL portion of the local submatrix (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of the local submatrix (same value is used for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the OFF-DIAGONAL portion of the local submatrix (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (respectively o_nnz) is given then d_nz (respectively o_nz) is ignored.

   On a single-process communicator the matrix is created as MATSEQAIJCUSPARSE and only
   d_nz/d_nnz are used for preallocation; otherwise it is created as MATMPIAIJCUSPARSE.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either the scalar (d_nz/o_nz) or the array
   (d_nnz/o_nnz) form, not both.  Set d_nz=PETSC_DEFAULT and d_nnz=NULL for PETSc to
   control dynamic memory allocation.  For large problems you MUST preallocate memory
   or you will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    /* uniprocess communicator: fall back to the sequential CUSPARSE type */
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

// get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
/*
   MatCUSPARSEGetDeviceMatWrite - returns (building lazily on first use) the
   device-resident split-CSR descriptor for A, for kernels that call
   MatSetValuesDevice-style routines.  Works for both sequential and MPI
   AIJCUSPARSE matrices; requires the matrix to be assembled and in
   MAT_CUSPARSE_CSR format.  On return A->assembled is cleared so the matrix is
   ready to receive values written on the device.
*/
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
{
#if defined(PETSC_USE_CTABLE)
  SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
#else
  PetscSplitCSRDataStructure **p_d_mat;
  PetscMPIInt                size,rank;
  MPI_Comm                   comm;
  PetscErrorCode             ierr;
  int                        *ai,*bi,*aj,*bj;
  PetscScalar                *aa,*ba;

  PetscFunctionBegin;
  ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    /* Grab raw device pointers to the CSR data of the (seq) diagonal block and,
       in parallel, the off-diagonal block; both must be MAT_CUSPARSE_CSR */
    CsrMatrix *matrixA,*matrixB=NULL;
    if (size == 1) {
      Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
      p_d_mat = &cusparsestruct->deviceMat;
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
      if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
        matrixA = (CsrMatrix*)matstruct->mat;
        bi = bj = NULL; ba = NULL;  /* no off-diagonal block in the sequential case */
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
    } else {
      Mat_MPIAIJ         *aij   = (Mat_MPIAIJ*)A->data;
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      p_d_mat = &spptr->deviceMat;
      Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
        if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
        matrixA = (CsrMatrix*)matstructA->mat;
        matrixB = (CsrMatrix*)matstructB->mat;
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
        bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
        ba = thrust::raw_pointer_cast(matrixB->values->data());
      } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    }
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
    aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
    aa = thrust::raw_pointer_cast(matrixA->values->data());
  } else {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  // act like MatSetValues because not called on host
  if (A->assembled) {
    if (A->was_assembled) {
      ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
    }
    A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
  } else {
    SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
  }
  if (!*p_d_mat) {
    /* First call: build the host-side descriptor h_mat and push it to the device */
    cudaError_t                err;
    PetscSplitCSRDataStructure *d_mat, h_mat;
    Mat_SeqAIJ                 *jaca;
    PetscInt                   n = A->rmap->n, nnz;

    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    err  = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    err  = cudaMemset( d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
    *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
    if (size == 1) {
      jaca = (Mat_SeqAIJ*)A->data;
      h_mat.rstart = 0; h_mat.rend = A->rmap->n;
      h_mat.cstart = 0; h_mat.cend = A->cmap->n;
      h_mat.offdiag.i = h_mat.offdiag.j = NULL;
      h_mat.offdiag.a = NULL;
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      Mat_SeqAIJ *jacb;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      jacb = (Mat_SeqAIJ*)aij->B->data;
      if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
      if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
      // create colmap - this is usually done (lazy) in MatSetValues
      aij->donotstash = PETSC_TRUE;
      aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
      jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly
      ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
      aij->colmap[A->cmap->N] = -9;   /* sentinel to detect an out-of-range write below */
      ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
      {
        PetscInt ii;
        for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
      }
      if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
      // allocate B copy data
      h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
      h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
      nnz = jacb->i[n];

      if (jacb->compressedrow.use) {
        /* host row pointers live only on the host when compressed rows are in use;
           make a device copy (freed in MatDestroy_MPIAIJCUSPARSE) */
        err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
        err = cudaMemcpy( h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      } else h_mat.offdiag.i = bi;
      h_mat.offdiag.j = bj;
      h_mat.offdiag.a = ba;

      err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
      err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
      h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
      h_mat.offdiag.n = n;
    }
    // allocate A copy data
    nnz = jaca->i[n];
    h_mat.diag.n = n;
    h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
    ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
    if (jaca->compressedrow.use) {
      err = cudaMalloc((void **)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
      err = cudaMemcpy( h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    } else {
      h_mat.diag.i = ai;
    }
    h_mat.diag.j = aj;
    h_mat.diag.a = aa;
    // copy pointers and metadata to device
    err  = cudaMemcpy( d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
    ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
  } else {
    *B = *p_d_mat;   /* already built: just hand it back */
  }
  A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
  PetscFunctionReturn(0);
#endif
}