#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <petscsf.h>

struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) { /* coo_p is the diag-first/offdiag-last permutation computed at preallocation time */
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* input is interleaved: stably partition it diag-first and remember the permutation */
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask   = PETSC_OFFLOAD_CPU;
  B->assembled     = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
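/*
   Usage sketch (comment only, not part of the build): the two routines above back the
   generic MatSetPreallocationCOO()/MatSetValuesCOO() interface once they are composed
   with the matrix in MatConvert_MPIAIJ_MPIAIJCUSPARSE() below. A minimal, hypothetical
   assembly loop then looks like the following; m, n, M, N, ncoo and the coo_i/coo_j/coo_v
   arrays are placeholders the caller must provide.

     Mat A;
     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,m,n,M,N);CHKERRQ(ierr);
     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatSetPreallocationCOO(A,ncoo,coo_i,coo_j);CHKERRQ(ierr); // global indices; split into diag/offdiag on the GPU above
     ierr = MatSetValuesCOO(A,coo_v,INSERT_VALUES);CHKERRQ(ierr);     // coo_v may be host or device memory (see isCudaMem above)
*/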
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
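/*
   Usage sketch (comment only): the routine above is reached through the public
   MatMPIAIJSetPreallocation() once "MatMPIAIJSetPreallocation_C" is composed with the
   matrix below. A minimal, hypothetical example reserving 5 nonzeros per row in the
   diagonal block and 2 in the off-diagonal block; exact per-row counts could instead
   be passed through the d_nnz/o_nnz arrays. M and N are placeholders.

     Mat A;
     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,M,N);CHKERRQ(ierr);
     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);CHKERRQ(ierr);
*/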
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  /* If A is bound to the CPU, the local vector used in the matrix multiplies should also be bound to the CPU. */
  if (A->boundtocpu) {ierr = VecBindToCPU(a->lvec,PETSC_TRUE);CHKERRQ(ierr);}
  /* overlap the halo scatter with the diagonal-block product */
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
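/*
   Options sketch (comment only): the handler above only parses the cusparse storage
   formats, so a run that keeps the diagonal block in CSR and stores the off-diagonal
   block in HYB format would look like this (hypothetical executable name):

     ./app -mat_type mpiaijcusparse -mat_cusparse_mult_offdiag_storage_format hyb

   or, programmatically, MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,MAT_CUSPARSE_HYB).
*/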
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz = A->nonzerostate;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;
    cusparseStruct->deviceMat           = NULL;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
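/*
   Conversion sketch (comment only): besides MatSetType(), an existing MPIAIJ matrix can
   be moved to the GPU through the converter above. A minimal, hypothetical use, assuming
   B is an assembled MATMPIAIJ matrix:

     Mat Bgpu;
     ierr = MatConvert(B,MATMPIAIJCUSPARSE,MAT_INITIAL_MATRIX,&Bgpu);CHKERRQ(ierr);

   With MAT_INPLACE_MATRIX the matrix is retyped in place, which is exactly how
   MatCreate_MPIAIJCUSPARSE() above uses the converter.
*/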
/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows
.  n - number of local columns
.  M - number of global rows
.  N - number of global columns
.  d_nz - number of nonzeros per row in the diagonal portion (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal portion
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal portion (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal portion
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
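/*
   Usage sketch for MatCreateAIJCUSPARSE() (comment only): a hypothetical call creating a
   distributed M x M matrix with at most 3 diagonal-block and 1 off-diagonal-block nonzeros
   per row; as the manual page notes, the MatCreate()/MatSetType()/MatXXXXSetPreallocation()
   paradigm is preferred in new code.

     Mat A;
     ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,M,M,3,NULL,1,NULL,&A);CHKERRQ(ierr);
*/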
/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.

  Level: beginner

.seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
M*/

PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);
// get GPU pointers to a stripped-down Mat; works for both Seq and MPI Mat
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  PetscErrorCode             ierr;
  int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
  PetscScalar                *aa = NULL,*ba = NULL;
  Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL,*matrixB = NULL;

  PetscFunctionBegin;
  if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  // get jaca
  if (size == 1) {
    PetscBool isseqaij;

    ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
    if (isseqaij) {
      jaca = (Mat_SeqAIJ*)A->data;
      if (!jaca->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      d_mat = cusparsestructA->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
    }
    if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      bi = NULL;
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    if (!A->nooffprocentries && !aij->donotstash) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    if (cusparsestructA->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      if (!matstructB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    cudaError_t                cerr;
    PetscSplitCSRDataStructure h_mat;

    // create and populate the struct on the host, then copy it to the device
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscInt   *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      if (n && !aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // have to make a long version of these
        int      *h_bi32, *h_bj32;
        PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        ierr = PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);CHKERRQ(ierr);
        cerr = cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
        cerr = cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];

        cerr = cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

        h_mat->offdiag.i = d_bi64;
        h_mat->offdiag.j = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE;

        ierr = PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);CHKERRQ(ierr);
      }
#else
      h_mat->offdiag.i = (PetscInt*)bi;
      h_mat->offdiag.j = (PetscInt*)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      ierr = PetscFree(colmap);CHKERRQ(ierr);
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N;
#if defined(PETSC_USE_64BIT_INDICES)
    {
      if (sizeof(PetscInt) != 8) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_PLIB,"size of PetscInt = %d",(int)sizeof(PetscInt));
      int      *h_ai32, *h_aj32;
      PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;
      ierr = PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);CHKERRQ(ierr);
      cerr = cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
      cerr = cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];

      cerr = cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      cerr = cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

      h_mat->diag.i = d_ai64;
      h_mat->diag.j = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      ierr = PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);CHKERRQ(ierr);
    }
#else
    h_mat->diag.i = (PetscInt*)ai;
    h_mat->diag.j = (PetscInt*)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
  }
  *B = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
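/*
   Device-assembly sketch (comment only): MatCUSPARSEGetDeviceMatWrite() above hands back
   a PetscSplitCSRDataStructure that a user CUDA kernel can fill. A minimal, hypothetical
   kernel, assuming the MatSetValuesDevice() helper from petscaijdevice.h and one thread
   per locally owned row:

     __global__ void assemble(PetscSplitCSRDataStructure d_mat,PetscInt rstart,PetscInt nrows)
     {
       PetscInt    i = rstart + blockIdx.x*blockDim.x + threadIdx.x; // global row owned by this rank
       PetscScalar v = 1.0;
       if (i < rstart + nrows) (void)MatSetValuesDevice(d_mat,1,&i,1,&i,&v,ADD_VALUES);
     }

   The matrix must already have been assembled once (the routine refuses unassembled
   input), and off-process insertion is not supported, as the SETERRQ messages above
   spell out.
*/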