#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n     = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
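/* The gather above computes coo_pw[k] = v[coo_p[k]] on the device: coo_p maps each
   entry of the diagonal-first layout built at preallocation time back to its position
   in the caller's original COO ordering, so the two halves of coo_pw line up with the
   diagonal (a->A) and off-diagonal (a->B) blocks. */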
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               N,*jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj);CHKERRQ(ierr);
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected IS size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask   = PETSC_OFFLOAD_CPU;
  B->assembled     = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
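/* The routine above backs the basic (owner-only entries) path of the public COO
   assembly interface. A minimal usage sketch, with hypothetical indices for a 2x2
   system on one rank (error checking elided):

     PetscInt    coo_i[] = {0,0,1};
     PetscInt    coo_j[] = {0,1,1};
     PetscScalar coo_v[] = {1.0,2.0,3.0};
     MatSetPreallocationCOO(A,3,coo_i,coo_j);
     MatSetValuesCOO(A,coo_v,INSERT_VALUES);
*/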
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat                newmat;
  Mat_MPIAIJ         *mpiaij;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscInt           coo_basic = 1;
  PetscMemType       mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt           rstart,rend;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (coo_i) {
    ierr = PetscLayoutGetRange(mat->rmap,&rstart,&rend);CHKERRQ(ierr);
    ierr = PetscGetMemType(coo_i,&mtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = 0; break;}
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  ierr = MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_INT,MPI_LAND,PetscObjectComm((PetscObject)mat));CHKERRMPI(ierr);

  if (coo_basic) {
    ierr = MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
  } else {
    mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
    ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr);
    ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(newmat,MATMPIAIJ);CHKERRQ(ierr);
    ierr = MatSetOption(newmat,MAT_IGNORE_OFF_PROC_ENTRIES,mpiaij->donotstash);CHKERRQ(ierr); /* Inherit the two options that we respect from mat */
    ierr = MatSetOption(newmat,MAT_NO_OFF_PROC_ENTRIES,mat->nooffprocentries);CHKERRQ(ierr);
    ierr = MatSetPreallocationCOO_MPIAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
    ierr = MatConvert(newmat,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
    ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
    ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */
    mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); /* mat->data was changed in MatHeaderMerge() */
    mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    cerr = cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar));CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}
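/* The kernels below consume the extended-COO maps built on the host by
   MatSetPreallocationCOO_MPIAIJ() and copied to the device above. Illustrative
   reading of the index arrays (hypothetical values): if the local input holds the
   repeated entries (0,0), (0,0), (1,1), the diagonal block has two unique nonzeros
   and one would get jmap = {0,2,3} and perm = {0,1,2}, so that the input values
   kv[perm[jmap[i]]] .. kv[perm[jmap[i+1]-1]] are summed into the CSR value slot
   a[imap[i]] of unique nonzero i, which is exactly what MatAddCOOValues() computes. */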
__global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
{
  PetscCount       i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) buf[i] = kv[perm[i]];
}

__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[])
{
  PetscCount       i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) {for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) a[imap[i]] += kv[perm[k]];}
}

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
  Mat                A = mpiaij->A,B = mpiaij->B;
  PetscCount         Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa,*Ba;
  PetscScalar        *vsend = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1 = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType       memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      cerr = cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (v) {cerr = cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);}
      else   {cerr = cudaMemset((void*)v1,0,mpiaij->coo_n*sizeof(PetscScalar));CHKERRCUDA(cerr);}
    }

    if (imode == INSERT_VALUES) {
      ierr = MatZeroEntries(A);CHKERRQ(ierr);
      ierr = MatZeroEntries(B);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa);CHKERRQ(ierr); /* write matrix values */
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba);CHKERRQ(ierr);
    } else {
      ierr = MatSeqAIJCUSPARSEGetArray(A,&Aa);CHKERRQ(ierr); /* read & write matrix values */
      ierr = MatSeqAIJCUSPARSEGetArray(B,&Ba);CHKERRQ(ierr);
    }

    /* Pack entries to be sent to remote */
    MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);

    /* Send remote entries to their owner and overlap the communication with local computation */
    ierr = PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE);CHKERRQ(ierr);
    /* Add local entries to A and B */
    MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa);
    MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba);
    ierr = PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE);CHKERRQ(ierr);

    /* Add received remote entries to A and B */
    MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa);
    MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);

    if (imode == INSERT_VALUES) {
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba);CHKERRQ(ierr);
    } else {
      ierr = MatSeqAIJCUSPARSERestoreArray(A,&Aa);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSERestoreArray(B,&Ba);CHKERRQ(ierr);
    }
    if (PetscMemTypeHost(memtype)) {cerr = cudaFree((void*)v1);CHKERRCUDA(cerr);}
  } else {
    ierr = MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
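/* The routine below implements MatMPIAIJGetLocalMatMerge() for this type: it merges
   the diagonal and off-diagonal blocks into one sequential matrix and optionally
   returns the global indices of the merged columns. Usage sketch (hypothetical
   matrix A; error checking elided):

     Mat Aloc;
     IS  glob;
     MatMPIAIJGetLocalMatMerge(A,MAT_INITIAL_MATRIX,&glob,&Aloc);
*/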
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  /* If A is bound to the CPU, the local vector used in the matrix multiplies should also be bound to the CPU. */
  if (A->boundtocpu) {ierr = VecBindToCPU(a->lvec,PETSC_TRUE);CHKERRQ(ierr);}
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
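/* MatMult above and MatMultAdd below follow the standard MPIAIJ overlap pattern:
   start the halo scatter of xx, apply the diagonal block while the scatter is in
   flight, finish the scatter, then accumulate the off-diagonal block applied to
   the received ghost values. */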
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
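/* Example (hypothetical command line): the options registered above select the
   storage format of the blocks at runtime, e.g.

     ./app -mat_type mpiaijcusparse -mat_cusparse_storage_format ell
*/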
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz    = A->nonzerostate;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  PetscCheckFalse(!cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    /* Extended COO */
    if (cusparseStruct->use_extended_coo) {
      cerr = cudaFree(cusparseStruct->Aimap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Ajmap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bimap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bjmap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aimap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Ajmap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aperm2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bimap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bjmap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bperm2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Cperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->sendbuf_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->recvbuf_d);CHKERRCUDA(cerr);
    } else {
      delete cusparseStruct->coo_p;
      delete cusparseStruct->coo_pw;
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
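/* The converter below is reached through the generic MatConvert() dispatch. Usage
   sketch (hypothetical MPIAIJ matrix A; error checking elided):

     MatConvert(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);
*/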
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  Mat_MPIAIJ       *a;
  cusparseStatus_t stat;
  Mat              A;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    Mat_MPIAIJCUSPARSE *cusp = new Mat_MPIAIJCUSPARSE;
    stat = cusparseCreate(&(cusp->handle));CHKERRCUSPARSE(stat);
    a->spptr = cusp;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters
   accurately, performance during matrix assembly can be increased by more than a
   factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE)
.  n - number of local columns (or PETSC_DECIDE)
.  M - number of global rows (or PETSC_DETERMINE)
.  N - number of global columns (or PETSC_DETERMINE)
.  d_nz - number of nonzeros per row of the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row of the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (resp. o_nnz) is given then d_nz (resp. o_nz) is ignored.

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.
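   Example usage (a sketch with hypothetical sizes: a 100 x 100 matrix spread across
   the processes of the communicator, with roughly 5 nonzeros per row in the diagonal
   block and 2 per row in the off-diagonal block):
.vb
   Mat A;
   ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,5,NULL,2,NULL,&A);CHKERRQ(ierr);
.ve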
773e057df02SPaul Mullowney 774e057df02SPaul Mullowney Level: intermediate 775e057df02SPaul Mullowney 776e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 777e057df02SPaul Mullowney @*/ 7789ae82921SPaul Mullowney PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 7799ae82921SPaul Mullowney { 7809ae82921SPaul Mullowney PetscErrorCode ierr; 7819ae82921SPaul Mullowney PetscMPIInt size; 7829ae82921SPaul Mullowney 7839ae82921SPaul Mullowney PetscFunctionBegin; 7849ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 7859ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 786ffc4695bSBarry Smith ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 7879ae82921SPaul Mullowney if (size > 1) { 7889ae82921SPaul Mullowney ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 7899ae82921SPaul Mullowney ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 7909ae82921SPaul Mullowney } else { 7919ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 7929ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 7939ae82921SPaul Mullowney } 7949ae82921SPaul Mullowney PetscFunctionReturn(0); 7959ae82921SPaul Mullowney } 7969ae82921SPaul Mullowney 7973ca39a21SBarry Smith /*MC 7986bb69460SJunchao Zhang MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATMPIAIJCUSPARSE. 799e057df02SPaul Mullowney 8002692e278SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 8012692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 8022692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 8039ae82921SPaul Mullowney 8049ae82921SPaul Mullowney This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 8059ae82921SPaul Mullowney and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 8069ae82921SPaul Mullowney MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 8079ae82921SPaul Mullowney for communicators controlling multiple processes. It is recommended that you call both of 8089ae82921SPaul Mullowney the above preallocation routines for simplicity. 8099ae82921SPaul Mullowney 8109ae82921SPaul Mullowney Options Database Keys: 811e057df02SPaul Mullowney + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 8128468deeeSKarl Rupp . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 8138468deeeSKarl Rupp . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 8148468deeeSKarl Rupp - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
7973ca39a21SBarry Smith /*MC
7986bb69460SJunchao Zhang    MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.
799e057df02SPaul Mullowney 
8002692e278SPaul Mullowney    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
8012692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
8022692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
8039ae82921SPaul Mullowney 
8049ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
8059ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
8069ae82921SPaul Mullowney    MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
8079ae82921SPaul Mullowney    for communicators controlling multiple processes. It is recommended that you call both of
8089ae82921SPaul Mullowney    the above preallocation routines for simplicity.
8099ae82921SPaul Mullowney 
8109ae82921SPaul Mullowney    Options Database Keys:
811e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
8128468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
8138468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
8148468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
8159ae82921SPaul Mullowney 
8169ae82921SPaul Mullowney   Level: beginner
8179ae82921SPaul Mullowney 
8186bb69460SJunchao Zhang .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
8196bb69460SJunchao Zhang M*/
8206bb69460SJunchao Zhang 
8216bb69460SJunchao Zhang /*MC
8226bb69460SJunchao Zhang    MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.
8236bb69460SJunchao Zhang 
8246bb69460SJunchao Zhang   Level: beginner
8256bb69460SJunchao Zhang 
8266bb69460SJunchao Zhang .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
8279ae82921SPaul Mullowney M*/
8283fa6b06aSMark Adams 
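/*
   Illustrative sketch (not part of the original source): the storage formats above can
   also be selected programmatically with MatCUSPARSESetFormat() on an aijcusparse
   matrix A, e.g. ELL for the diagonal block and HYB for the off-diagonal block:

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,MAT_CUSPARSE_ELL);CHKERRQ(ierr);
     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,MAT_CUSPARSE_HYB);CHKERRQ(ierr);

   which matches the command line options

     -mat_cusparse_mult_diag_storage_format ell -mat_cusparse_mult_offdiag_storage_format hyb
*/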
829042217e8SBarry Smith // get GPU pointers to stripped down Mat. For both seq and MPI Mat.
830042217e8SBarry Smith PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
8313fa6b06aSMark Adams {
832042217e8SBarry Smith   PetscSplitCSRDataStructure d_mat;
833042217e8SBarry Smith   PetscMPIInt                size;
8343fa6b06aSMark Adams   PetscErrorCode             ierr;
835042217e8SBarry Smith   int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
836042217e8SBarry Smith   PetscScalar                *aa = NULL,*ba = NULL;
83799551766SMark Adams   Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
838042217e8SBarry Smith   Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
839042217e8SBarry Smith   CsrMatrix                  *matrixA = NULL,*matrixB = NULL;
8403fa6b06aSMark Adams 
8413fa6b06aSMark Adams   PetscFunctionBegin;
8422c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
843042217e8SBarry Smith   if (A->factortype != MAT_FACTOR_NONE) {
8443fa6b06aSMark Adams     *B = NULL;
8453fa6b06aSMark Adams     PetscFunctionReturn(0);
8463fa6b06aSMark Adams   }
847042217e8SBarry Smith   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
84899551766SMark Adams   // get jaca
8493fa6b06aSMark Adams   if (size == 1) {
850042217e8SBarry Smith     PetscBool isseqaij;
851042217e8SBarry Smith 
852042217e8SBarry Smith     ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
853042217e8SBarry Smith     if (isseqaij) {
8543fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)A->data;
8552c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
856042217e8SBarry Smith       cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
857042217e8SBarry Smith       d_mat = cusparsestructA->deviceMat;
858042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
8593fa6b06aSMark Adams     } else {
8603fa6b06aSMark Adams       Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
8612c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
862042217e8SBarry Smith       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
8633fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)aij->A->data;
864042217e8SBarry Smith       cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
865042217e8SBarry Smith       d_mat = spptr->deviceMat;
866042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
8673fa6b06aSMark Adams     }
868042217e8SBarry Smith     if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
869042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
8702c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
871042217e8SBarry Smith       matrixA = (CsrMatrix*)matstruct->mat;
872042217e8SBarry Smith       bi = NULL;
873042217e8SBarry Smith       bj = NULL;
874042217e8SBarry Smith       ba = NULL;
875042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
876042217e8SBarry Smith   } else {
877042217e8SBarry Smith     Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
8782c71b3e2SJacob Faibussowitsch     PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
879042217e8SBarry Smith     jaca = (Mat_SeqAIJ*)aij->A->data;
88099551766SMark Adams     jacb = (Mat_SeqAIJ*)aij->B->data;
881042217e8SBarry Smith     Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
8823fa6b06aSMark Adams 
8832c71b3e2SJacob Faibussowitsch     PetscCheckFalse(!A->nooffprocentries && !aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
884042217e8SBarry Smith     cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
885042217e8SBarry Smith     Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
8862c71b3e2SJacob Faibussowitsch     PetscCheckFalse(cusparsestructA->format!=MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
887042217e8SBarry Smith     if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
888042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
889042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
890042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
891042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
8922c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
8932c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
894042217e8SBarry Smith       matrixA = (CsrMatrix*)matstructA->mat;
895042217e8SBarry Smith       matrixB = (CsrMatrix*)matstructB->mat;
8963fa6b06aSMark Adams       if (jacb->compressedrow.use) {
897042217e8SBarry Smith         if (!cusparsestructB->rowoffsets_gpu) {
898042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
899042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
9003fa6b06aSMark Adams         }
901042217e8SBarry Smith         bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
902042217e8SBarry Smith       } else {
903042217e8SBarry Smith         bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
904042217e8SBarry Smith       }
905042217e8SBarry Smith       bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
906042217e8SBarry Smith       ba = thrust::raw_pointer_cast(matrixB->values->data());
907042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
908042217e8SBarry Smith     d_mat = spptr->deviceMat;
909042217e8SBarry Smith   }
9103fa6b06aSMark Adams   if (jaca->compressedrow.use) {
911042217e8SBarry Smith     if (!cusparsestructA->rowoffsets_gpu) {
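      /* Added explanatory comment: the device CsrMatrix was built in compressed-row
         form and so only holds offsets for nonempty rows; a full-length row-offsets
         array (rmap->n+1 entries) is rebuilt on the device from the host offsets
         jaca->i. Illustrative example: rows {empty, [a b], empty, [c]} have full
         offsets {0,0,2,2,3}, while the compressed form stores offsets only for the
         two nonempty rows. */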
912042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
913042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
9143fa6b06aSMark Adams     }
915042217e8SBarry Smith     ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
9163fa6b06aSMark Adams   } else {
917042217e8SBarry Smith     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
9183fa6b06aSMark Adams   }
919042217e8SBarry Smith   aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
920042217e8SBarry Smith   aa = thrust::raw_pointer_cast(matrixA->values->data());
921042217e8SBarry Smith 
922042217e8SBarry Smith   if (!d_mat) {
923042217e8SBarry Smith     cudaError_t cerr;
924042217e8SBarry Smith     PetscSplitCSRDataStructure h_mat;
925042217e8SBarry Smith 
926042217e8SBarry Smith     // create and populate the struct on the host and copy it to the device
927042217e8SBarry Smith     ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
928042217e8SBarry Smith     ierr = PetscNew(&h_mat);CHKERRQ(ierr);
929042217e8SBarry Smith     cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
930042217e8SBarry Smith     if (size > 1) { /* need the colmap array */
931042217e8SBarry Smith       Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
93299551766SMark Adams       PetscInt   *colmap;
933042217e8SBarry Smith       PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;
934042217e8SBarry Smith 
9352c71b3e2SJacob Faibussowitsch       PetscCheckFalse(n && !aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
936042217e8SBarry Smith 
937042217e8SBarry Smith       ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
938042217e8SBarry Smith       for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
939365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
940365b711fSMark Adams       { // have to make a long version of these
94199551766SMark Adams         int      *h_bi32, *h_bj32;
94299551766SMark Adams         PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
943365b711fSMark Adams         ierr = PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);CHKERRQ(ierr);
94499551766SMark Adams         cerr = cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
94599551766SMark Adams         for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
94699551766SMark Adams         cerr = cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
94799551766SMark Adams         for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];
94899551766SMark Adams 
94999551766SMark Adams         cerr = cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));CHKERRCUDA(cerr);
95099551766SMark Adams         cerr = cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
95199551766SMark Adams         cerr = cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));CHKERRCUDA(cerr);
95299551766SMark Adams         cerr = cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
95399551766SMark Adams 
95499551766SMark Adams         h_mat->offdiag.i = d_bi64;
95599551766SMark Adams         h_mat->offdiag.j = d_bj64;
95699551766SMark Adams         h_mat->allocated_indices = PETSC_TRUE;
95799551766SMark Adams 
958365b711fSMark Adams         ierr = PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);CHKERRQ(ierr);
959365b711fSMark Adams       }
960365b711fSMark Adams #else
96199551766SMark Adams       h_mat->offdiag.i = (PetscInt*)bi;
96299551766SMark Adams       h_mat->offdiag.j = (PetscInt*)bj;
96399551766SMark Adams       h_mat->allocated_indices = PETSC_FALSE;
964365b711fSMark Adams #endif
965042217e8SBarry Smith       h_mat->offdiag.a = ba;
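      /* Added explanatory comment: colmap maps a global column index to (local
         off-diagonal column index + 1); an entry of 0 means this rank does not store
         that global column in its off-diagonal block. Illustrative example: with
         garray = {3,7}, colmap[3] = 1 and colmap[7] = 2, and every other entry is 0.
         The array is copied to the device just below. */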
966042217e8SBarry Smith       h_mat->offdiag.n = A->rmap->n;
967042217e8SBarry Smith 
96899551766SMark Adams       cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));CHKERRCUDA(cerr);
96999551766SMark Adams       cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
970042217e8SBarry Smith       ierr = PetscFree(colmap);CHKERRQ(ierr);
971042217e8SBarry Smith     }
972042217e8SBarry Smith     h_mat->rstart = A->rmap->rstart;
973042217e8SBarry Smith     h_mat->rend   = A->rmap->rend;
974042217e8SBarry Smith     h_mat->cstart = A->cmap->rstart;
975042217e8SBarry Smith     h_mat->cend   = A->cmap->rend;
97649b994a9SMark Adams     h_mat->M      = A->cmap->N;
977365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
978365b711fSMark Adams     {
9792c71b3e2SJacob Faibussowitsch       PetscCheckFalse(sizeof(PetscInt) != 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"size of PetscInt = %d",(int)sizeof(PetscInt));
98099551766SMark Adams       int      *h_ai32, *h_aj32;
98199551766SMark Adams       PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;
982365b711fSMark Adams       ierr = PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);CHKERRQ(ierr);
98399551766SMark Adams       cerr = cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
98499551766SMark Adams       for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
98599551766SMark Adams       cerr = cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
98699551766SMark Adams       for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];
98799551766SMark Adams 
98899551766SMark Adams       cerr = cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));CHKERRCUDA(cerr);
98999551766SMark Adams       cerr = cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
99099551766SMark Adams       cerr = cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));CHKERRCUDA(cerr);
99199551766SMark Adams       cerr = cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
99299551766SMark Adams 
99399551766SMark Adams       h_mat->diag.i = d_ai64;
99499551766SMark Adams       h_mat->diag.j = d_aj64;
99599551766SMark Adams       h_mat->allocated_indices = PETSC_TRUE;
99699551766SMark Adams 
997365b711fSMark Adams       ierr = PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);CHKERRQ(ierr);
998365b711fSMark Adams     }
999365b711fSMark Adams #else
100099551766SMark Adams     h_mat->diag.i = (PetscInt*)ai;
100199551766SMark Adams     h_mat->diag.j = (PetscInt*)aj;
100299551766SMark Adams     h_mat->allocated_indices = PETSC_FALSE;
1003365b711fSMark Adams #endif
1004042217e8SBarry Smith     h_mat->diag.a = aa;
1005042217e8SBarry Smith     h_mat->diag.n = A->rmap->n;
1006042217e8SBarry Smith     h_mat->rank   = PetscGlobalRank;
1007042217e8SBarry Smith     // copy pointers and metadata to device
1008042217e8SBarry Smith     cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1009042217e8SBarry Smith     ierr = PetscFree(h_mat);CHKERRQ(ierr);
1010042217e8SBarry Smith   } else {
1011042217e8SBarry Smith     ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
1012042217e8SBarry Smith   }
1013042217e8SBarry Smith   *B = d_mat;
1014042217e8SBarry Smith   A->offloadmask = PETSC_OFFLOAD_GPU;
10153fa6b06aSMark Adams   PetscFunctionReturn(0);
10163fa6b06aSMark Adams }
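/*
   Illustrative sketch (not part of the original source) of how the returned split-CSR
   handle is typically consumed: pass it to a CUDA kernel and insert values into existing
   nonzero locations with MatSetValuesDevice() from petscaijdevice.h. The kernel name,
   launch geometry, and the diagonal-only insertion pattern are hypothetical; per-thread
   error codes are ignored here for brevity.

     #include <petscaijdevice.h>

     __global__ void AssembleDiagonal(PetscSplitCSRDataStructure d_mat, PetscInt rstart, PetscInt rend)
     {
       PetscInt    i = rstart + blockIdx.x*blockDim.x + threadIdx.x; // one global row per thread
       PetscScalar v = 1.0;
       if (i < rend) (void)MatSetValuesDevice(d_mat,1,&i,1,&i,&v,ADD_VALUES);
     }

     // host side
     PetscSplitCSRDataStructure d_mat;
     PetscInt                   rstart,rend;
     ierr = MatCUSPARSEGetDeviceMatWrite(A,&d_mat);CHKERRQ(ierr);
     ierr = MatGetOwnershipRange(A,&rstart,&rend);CHKERRQ(ierr);
     AssembleDiagonal<<<(unsigned)((rend-rstart+127)/128),128>>>(d_mat,rstart,rend);
     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
*/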