/*
  MPIAIJCUSPARSE: MPI-parallel AIJ matrix whose on-process diagonal block (A) and
  off-diagonal block (B) are stored as SEQAIJCUSPARSE matrices on the GPU.
  This chunk implements COO (coordinate-format) preallocation/assembly plus the
  basic mat-vec operations for this type.
*/
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

/* Thrust functor: copy tuple element 0 into tuple element 1.
   Used below with a zip of (permuted source values, destination) to gather
   v[] through the permutation coo_p into the work array coo_pw. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/*
  MatSetValuesCOO_MPIAIJCUSPARSE_Basic - insert/add the COO values v[] into the
  diagonal (a->A) and off-diagonal (a->B) sequential CUSPARSE blocks.

  If preallocation built a permutation (cusp->coo_p != NULL), the entries were
  reordered on the device so that all diagonal-block entries precede the
  off-diagonal ones; v[] must then be gathered through that permutation into
  cusp->coo_pw before being split at offset cusp->coo_nd.  A host v[] is staged
  through a temporary device array w.  Marks A assembled with data on the GPU.
*/
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no; /* total number of COO entries on this rank */
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) { /* permutation exists: gather v through coo_p into coo_pw on device */
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      /* v lives on the host: copy it into a temporary device array first */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* zip (v permuted by coo_p, coo_pw) and copy element-wise with VecCUDAEquals */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    /* coo_pw now holds values with diagonal-block entries first; split at coo_nd */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    /* entries were already grouped (no permutation needed); v may be NULL */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU; /* freshest data is now on the device */
  PetscFunctionReturn(0);
}

/* Predicate on a (row, col, perm) tuple: true when the COLUMN (tuple element 1)
   lies inside [cstart, cend), i.e. the entry belongs to the diagonal block.
   Used with thrust::partition to move diagonal-block entries to the front. */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

/* Predicate on a bare column index: true when it falls OUTSIDE the owned
   column range [cstart, cend), i.e. the entry is off-diagonal. */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

/* Transform functor: shift a global index down by the ownership-range start,
   converting it to a process-local index. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

/*
  MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic - preallocate from COO indices that
  are all locally owned (no off-process rows, no negative indices).

  Steps:
    1. Destroy/reset any previous diagonal (b->A) and off-diagonal (b->B) blocks.
    2. Copy coo_i/coo_j to the device; stable-partition the entries so that
       diagonal-block entries come first, recording the permutation in coo_p
       (only when the input was not already grouped that way).
    3. Convert row indices (and diagonal-block column indices) to local numbering.
    4. Sort+unique the off-diagonal column indices on the device to build
       b->garray and a global-to-local hash map; compact jj[] to local ids.
    5. Create b->A / b->B as SEQAIJCUSPARSE and hand each its COO slice.
*/
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               N,*jj;
  size_t                 noff = 0;              /* number of distinct off-diagonal columns */
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  /* tear down any previous layout: handles, column map, local vector, blocks */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* a diagonal entry follows an off-diagonal one, so the input is not grouped:
       partition (i, j, identity) tuples so diagonal-block entries come first,
       and keep the resulting index permutation in coo_p for use at SetValues time */
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* # diagonal-block entries */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* # off-diagonal entries */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  /* garray = sorted distinct global column ids of the off-diagonal block */
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* build a hash-based global->local map over garray and compact jj[] in place */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj);CHKERRQ(ierr);
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* recreate the diagonal block (local rows x local cols) ... */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  /* ... and the off-diagonal block (local rows x noff compacted cols) */
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/*
  MatSetPreallocationCOO_MPIAIJCUSPARSE - COO preallocation entry point.

  Chooses between two code paths:
    - "basic": all COO indices on every rank are non-negative and locally owned
      (decided collectively via an allreduce); handled entirely on the device.
    - "extended": some entries are negative (to be ignored) or off-process; the
      general MPIAIJ COO machinery builds the send/receive maps on the host,
      the matrix is converted in place to MPIAIJCUSPARSE, and all maps are
      mirrored into device memory for use by MatSetValuesCOO.
*/
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat                newmat;
  Mat_MPIAIJ         *mpiaij;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscInt           coo_basic = 1; /* 1 = simple all-local case; collectively agreed below */
  PetscMemType       mtype = PETSC_MEMTYPE_DEVICE;
  PetscInt           rstart,rend;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (coo_i) {
    ierr = PetscLayoutGetRange(mat->rmap,&rstart,&rend);CHKERRQ(ierr);
    ierr = PetscGetMemType(coo_i,&mtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(mtype)) { /* only scan indices when they live on the host */
      for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = 0; break;}
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  ierr = MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_INT,MPI_LAND,PetscObjectComm((PetscObject)mat));CHKERRMPI(ierr);

  if (coo_basic) {
    ierr = MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
  } else {
    /* general path: build COO maps with the host MPIAIJ implementation on a
       scratch matrix, then merge it into mat and convert to CUSPARSE */
    mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
    ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr);
    ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(newmat,MATMPIAIJ);CHKERRQ(ierr);
    ierr = MatSetOption(newmat,MAT_IGNORE_OFF_PROC_ENTRIES,mpiaij->donotstash);CHKERRQ(ierr); /* Inherit the two options that we respect from mat */
    ierr = MatSetOption(newmat,MAT_NO_OFF_PROC_ENTRIES,mat->nooffprocentries);CHKERRQ(ierr);
    ierr = MatSetPreallocationCOO_MPIAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
    ierr = MatConvert(newmat,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
    ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
    mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); /* mat->data was changed in MatHeaderReplace() */
    mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(mpiaij->A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(mpiaij->B);CHKERRQ(ierr);
    ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */
    mpidev->use_extended_coo = PETSC_TRUE;

    /* mirror the host-side COO maps (built by MatSetPreallocationCOO_MPIAIJ)
       into device memory: *1 = locally produced entries, *2 = entries received
       from remote ranks, Cperm1 = packing permutation for the send buffer */
    cerr = cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar));CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}

/* Gather kernel: buf[i] = kv[perm[i]] for i in [0, nnz), grid-stride loop.
   Packs the COO values destined for remote ranks into the send buffer. */
__global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
{
  PetscCount       i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) buf[i] = kv[perm[i]];
}

/* Accumulation kernel: for each target nonzero i, sum the kv[] entries listed in
   perm[jmap[i]..jmap[i+1]) into a[imap[i]].  Grid-stride loop over nnz targets.
   NOTE(review): no atomics are used, so this presumes each imap[i] is distinct
   across threads (one thread owns each target nonzero) — the maps built by
   MatSetPreallocationCOO_MPIAIJ appear to guarantee this; verify if reused. */
__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[])
{
  PetscCount       i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) {for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) a[imap[i]] += kv[perm[k]];}
}

/*
  MatSetValuesCOO_MPIAIJCUSPARSE - assemble COO values on the device.

  Extended path (preallocation saw off-process or negative entries):
    1. stage a host v[] to the device if needed;
    2. pack remote-destined entries and start an SF reduce to their owners,
       overlapping the communication with the local accumulation kernels;
    3. add locally produced entries (maps *1), then the received ones (*2),
       into the CSR value arrays of the diagonal (A) and off-diagonal (B) blocks;
    4. for INSERT_VALUES the value arrays are zeroed first so "+=" acts as "=".
  Otherwise defers to the basic path.
*/
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
  Mat                A = mpiaij->A,B = mpiaij->B;
  PetscCount         Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa,*Ba;
  PetscScalar        *vsend = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1 = v; /* device view of the input values */
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType       memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      cerr = cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    }

    if (imode == INSERT_VALUES) {
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa);CHKERRQ(ierr); /* write matrix values */
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba);CHKERRQ(ierr);
      /* zero first so the accumulation kernels implement "insert" semantics */
      cerr = cudaMemset(Aa,0,((Mat_SeqAIJ*)A->data)->nz*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMemset(Ba,0,((Mat_SeqAIJ*)B->data)->nz*sizeof(PetscScalar));CHKERRCUDA(cerr);
    } else {
      ierr = MatSeqAIJCUSPARSEGetArray(A,&Aa);CHKERRQ(ierr); /* read & write matrix values */
      ierr = MatSeqAIJCUSPARSEGetArray(B,&Ba);CHKERRQ(ierr);
    }

    /* Pack entries to be sent to remote */
    MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);

    /* Send remote entries to their owner and overlap the communication with local computation */
    ierr = PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE);CHKERRQ(ierr);
    /* Add local entries to A and B */
    MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa);
    MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba);
    ierr = PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE);CHKERRQ(ierr);

    /* Add received remote entries to A and B */
    MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa);
    MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);

    if (imode == INSERT_VALUES) {
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba);CHKERRQ(ierr);
    } else {
      ierr = MatSeqAIJCUSPARSERestoreArray(A,&Aa);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSERestoreArray(B,&Ba);CHKERRQ(ierr);
    }
    if (PetscMemTypeHost(memtype)) {cerr = cudaFree((void*)v1);CHKERRCUDA(cerr);} /* free the staging copy */
  } else {
    ierr = MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE - merge the diagonal and off-diagonal
  blocks into one sequential CUSPARSE matrix A_loc; optionally return in glob the
  global column indices of A_loc's columns (owned columns first, then garray).
*/
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;   /* owned columns: contiguous global range */
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];   /* off-diagonal columns: from garray */
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*
  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE - classic (nnz-per-row) preallocation.
  Validates d_nnz/o_nnz in debug builds, resets the column map/scatter state,
  (re)creates the diagonal and off-diagonal blocks as SEQAIJCUSPARSE and
  preallocates them, then propagates format/handle settings from the parent.
*/
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    /* off-diagonal block is empty on a single process */
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/*
  MatMult_MPIAIJCUSPARSE - y = A*x.  Overlaps the scatter of ghost values into
  a->lvec with the diagonal-block multiply, then adds the off-diagonal
  contribution: yy = A_diag*xx + B_offdiag*lvec.
*/
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  /* If A is bound to the CPU, the local vector used in the matrix multiplies should also be bound to the CPU. */
  if (A->boundtocpu) {ierr = VecBindToCPU(a->lvec,PETSC_TRUE);CHKERRQ(ierr);}
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); /* overlapped with the scatter */
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* MatZeroEntries_MPIAIJCUSPARSE - zero both sequential blocks. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  MatMultAdd_MPIAIJCUSPARSE - zz = yy + A*xx, with the same
  scatter/compute overlap as MatMult above.
*/
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* MatMultTranspose_MPIAIJCUSPARSE - yy = A^T * xx.
   (Definition continues beyond this view; body truncated at the source edge.) */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr =
VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 5059b2db222SKarl Rupp ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 506ca45077fSPaul Mullowney PetscFunctionReturn(0); 507ca45077fSPaul Mullowney } 5089ae82921SPaul Mullowney 509e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 510ca45077fSPaul Mullowney { 511ca45077fSPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 512bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 513e057df02SPaul Mullowney 514ca45077fSPaul Mullowney PetscFunctionBegin; 515e057df02SPaul Mullowney switch (op) { 516e057df02SPaul Mullowney case MAT_CUSPARSE_MULT_DIAG: 517e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = format; 518045c96e1SPaul Mullowney break; 519e057df02SPaul Mullowney case MAT_CUSPARSE_MULT_OFFDIAG: 520e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = format; 521045c96e1SPaul Mullowney break; 522e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 523e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = format; 524e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = format; 525045c96e1SPaul Mullowney break; 526e057df02SPaul Mullowney default: 52798921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. 
Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 528045c96e1SPaul Mullowney } 529ca45077fSPaul Mullowney PetscFunctionReturn(0); 530ca45077fSPaul Mullowney } 531e057df02SPaul Mullowney 5324416b707SBarry Smith PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 533e057df02SPaul Mullowney { 534e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 535e057df02SPaul Mullowney PetscErrorCode ierr; 536e057df02SPaul Mullowney PetscBool flg; 537a183c035SDominic Meiser Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 538a183c035SDominic Meiser Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 5395fd66863SKarl Rupp 540e057df02SPaul Mullowney PetscFunctionBegin; 541e55864a3SBarry Smith ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 542e057df02SPaul Mullowney if (A->factortype==MAT_FACTOR_NONE) { 543e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 544a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 545e057df02SPaul Mullowney if (flg) { 546e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 547e057df02SPaul Mullowney } 548e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 549a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 550e057df02SPaul Mullowney if (flg) { 551e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 552e057df02SPaul Mullowney } 553e057df02SPaul 
Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 554a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 555e057df02SPaul Mullowney if (flg) { 556e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 557e057df02SPaul Mullowney } 558e057df02SPaul Mullowney } 5590af67c1bSStefano Zampini ierr = PetscOptionsTail();CHKERRQ(ierr); 560e057df02SPaul Mullowney PetscFunctionReturn(0); 561e057df02SPaul Mullowney } 562e057df02SPaul Mullowney 56334d6c7a5SJose E. Roman PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 56434d6c7a5SJose E. Roman { 56534d6c7a5SJose E. Roman PetscErrorCode ierr; 5663fa6b06aSMark Adams Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 567042217e8SBarry Smith Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr; 568042217e8SBarry Smith PetscObjectState onnz = A->nonzerostate; 569042217e8SBarry Smith 57034d6c7a5SJose E. Roman PetscFunctionBegin; 57134d6c7a5SJose E. 
Roman ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 572042217e8SBarry Smith if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); } 573042217e8SBarry Smith if (onnz != A->nonzerostate && cusp->deviceMat) { 574042217e8SBarry Smith PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat; 575042217e8SBarry Smith cudaError_t cerr; 576042217e8SBarry Smith 577042217e8SBarry Smith ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr); 578042217e8SBarry Smith ierr = PetscNew(&h_mat);CHKERRQ(ierr); 579042217e8SBarry Smith cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 580042217e8SBarry Smith cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr); 58199551766SMark Adams if (h_mat->allocated_indices) { 58299551766SMark Adams cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr); 58399551766SMark Adams cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr); 58499551766SMark Adams if (h_mat->offdiag.j) { 58599551766SMark Adams cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr); 58699551766SMark Adams cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr); 58799551766SMark Adams } 58899551766SMark Adams } 589042217e8SBarry Smith cerr = cudaFree(d_mat);CHKERRCUDA(cerr); 590042217e8SBarry Smith ierr = PetscFree(h_mat);CHKERRQ(ierr); 591042217e8SBarry Smith cusp->deviceMat = NULL; 5923fa6b06aSMark Adams } 59334d6c7a5SJose E. Roman PetscFunctionReturn(0); 59434d6c7a5SJose E. Roman } 59534d6c7a5SJose E. 
/* Release all cuSPARSE/CUDA resources held in spptr (device matrix copy, handle,
   extended-COO buffers or the legacy COO permutation arrays), unregister the composed
   methods, then fall through to the generic MPIAIJ destroy. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  PetscCheckFalse(!cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the device struct to the host so its embedded device pointers can be freed */
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    /* Extended COO */
    if (cusparseStruct->use_extended_coo) {
      cerr = cudaFree(cusparseStruct->Aimap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Ajmap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bimap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bjmap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aimap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Ajmap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aperm2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bimap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bjmap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bperm2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Cperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->sendbuf_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->recvbuf_d);CHKERRCUDA(cerr);
    } else {
      delete cusparseStruct->coo_p;
      delete cusparseStruct->coo_pw;
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Convert an MPIAIJ matrix to MPIAIJCUSPARSE: retype the sequential blocks and local
   vector to their CUDA variants, create the cuSPARSE handle if needed, and install the
   CUDA-aware operations and composed methods. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  Mat_MPIAIJ       *a;
  cusparseStatus_t stat;
  Mat              A;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;  /* for MAT_INPLACE_MATRIX, *newmat aliases B */

  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    Mat_MPIAIJCUSPARSE *cusp = new Mat_MPIAIJCUSPARSE;
    stat = cusparseCreate(&(cusp->handle));CHKERRCUSPARSE(stat);
    a->spptr = cusp;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

/* Type constructor: build a plain MPIAIJ matrix and convert it in place. */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters
   accurately, performance during matrix assembly can be increased by more than a
   factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have calculated if M is given)
.  n - number of local columns (or PETSC_DECIDE to have calculated if N is given)
.  M - number of global rows (or PETSC_DETERMINE to have calculated if m is given)
.  N - number of global columns (or PETSC_DETERMINE to have calculated if n is given)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
8169ae82921SPaul Mullowney 8179ae82921SPaul Mullowney Level: beginner 8189ae82921SPaul Mullowney 8196bb69460SJunchao Zhang .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 8206bb69460SJunchao Zhang M*/ 8216bb69460SJunchao Zhang 8226bb69460SJunchao Zhang /*MC 8236bb69460SJunchao Zhang MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATAIJCUSPARSE. 8246bb69460SJunchao Zhang 8256bb69460SJunchao Zhang Level: beginner 8266bb69460SJunchao Zhang 8276bb69460SJunchao Zhang .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE 8289ae82921SPaul Mullowney M*/ 8293fa6b06aSMark Adams 830042217e8SBarry Smith // get GPU pointers to stripped down Mat. For both seq and MPI Mat. 831042217e8SBarry Smith PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B) 8323fa6b06aSMark Adams { 833042217e8SBarry Smith PetscSplitCSRDataStructure d_mat; 834042217e8SBarry Smith PetscMPIInt size; 8353fa6b06aSMark Adams PetscErrorCode ierr; 836042217e8SBarry Smith int *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL; 837042217e8SBarry Smith PetscScalar *aa = NULL,*ba = NULL; 83899551766SMark Adams Mat_SeqAIJ *jaca = NULL, *jacb = NULL; 839042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusparsestructA = NULL; 840042217e8SBarry Smith CsrMatrix *matrixA = NULL,*matrixB = NULL; 8413fa6b06aSMark Adams 8423fa6b06aSMark Adams PetscFunctionBegin; 8432c71b3e2SJacob Faibussowitsch PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix"); 844042217e8SBarry Smith if (A->factortype != MAT_FACTOR_NONE) { 8453fa6b06aSMark Adams *B = NULL; 8463fa6b06aSMark Adams PetscFunctionReturn(0); 8473fa6b06aSMark Adams } 848042217e8SBarry Smith ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr); 84999551766SMark Adams // get jaca 8503fa6b06aSMark Adams if (size == 1) { 851042217e8SBarry 
Smith PetscBool isseqaij; 852042217e8SBarry Smith 853042217e8SBarry Smith ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr); 854042217e8SBarry Smith if (isseqaij) { 8553fa6b06aSMark Adams jaca = (Mat_SeqAIJ*)A->data; 8562c71b3e2SJacob Faibussowitsch PetscCheckFalse(!jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 857042217e8SBarry Smith cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr; 858042217e8SBarry Smith d_mat = cusparsestructA->deviceMat; 859042217e8SBarry Smith ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 8603fa6b06aSMark Adams } else { 8613fa6b06aSMark Adams Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 8622c71b3e2SJacob Faibussowitsch PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 863042217e8SBarry Smith Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 8643fa6b06aSMark Adams jaca = (Mat_SeqAIJ*)aij->A->data; 865042217e8SBarry Smith cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr; 866042217e8SBarry Smith d_mat = spptr->deviceMat; 867042217e8SBarry Smith ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr); 8683fa6b06aSMark Adams } 869042217e8SBarry Smith if (cusparsestructA->format==MAT_CUSPARSE_CSR) { 870042217e8SBarry Smith Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 8712c71b3e2SJacob Faibussowitsch PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A"); 872042217e8SBarry Smith matrixA = (CsrMatrix*)matstruct->mat; 873042217e8SBarry Smith bi = NULL; 874042217e8SBarry Smith bj = NULL; 875042217e8SBarry Smith ba = NULL; 876042217e8SBarry Smith } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR"); 877042217e8SBarry Smith } else { 878042217e8SBarry Smith Mat_MPIAIJ *aij = 
(Mat_MPIAIJ*)A->data; 8792c71b3e2SJacob Faibussowitsch PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 880042217e8SBarry Smith jaca = (Mat_SeqAIJ*)aij->A->data; 88199551766SMark Adams jacb = (Mat_SeqAIJ*)aij->B->data; 882042217e8SBarry Smith Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 8833fa6b06aSMark Adams 8842c71b3e2SJacob Faibussowitsch PetscCheckFalse(!A->nooffprocentries && !aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)"); 885042217e8SBarry Smith cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr; 886042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr; 8872c71b3e2SJacob Faibussowitsch PetscCheckFalse(cusparsestructA->format!=MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR"); 888042217e8SBarry Smith if (cusparsestructB->format==MAT_CUSPARSE_CSR) { 889042217e8SBarry Smith ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr); 890042217e8SBarry Smith ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr); 891042217e8SBarry Smith Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 892042217e8SBarry Smith Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat; 8932c71b3e2SJacob Faibussowitsch PetscCheckFalse(!matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A"); 8942c71b3e2SJacob Faibussowitsch PetscCheckFalse(!matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B"); 895042217e8SBarry Smith matrixA = (CsrMatrix*)matstructA->mat; 896042217e8SBarry Smith matrixB = (CsrMatrix*)matstructB->mat; 8973fa6b06aSMark Adams 
if (jacb->compressedrow.use) { 898042217e8SBarry Smith if (!cusparsestructB->rowoffsets_gpu) { 899042217e8SBarry Smith cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 900042217e8SBarry Smith cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1); 9013fa6b06aSMark Adams } 902042217e8SBarry Smith bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data()); 903042217e8SBarry Smith } else { 904042217e8SBarry Smith bi = thrust::raw_pointer_cast(matrixB->row_offsets->data()); 905042217e8SBarry Smith } 906042217e8SBarry Smith bj = thrust::raw_pointer_cast(matrixB->column_indices->data()); 907042217e8SBarry Smith ba = thrust::raw_pointer_cast(matrixB->values->data()); 908042217e8SBarry Smith } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR"); 909042217e8SBarry Smith d_mat = spptr->deviceMat; 910042217e8SBarry Smith } 9113fa6b06aSMark Adams if (jaca->compressedrow.use) { 912042217e8SBarry Smith if (!cusparsestructA->rowoffsets_gpu) { 913042217e8SBarry Smith cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 914042217e8SBarry Smith cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1); 9153fa6b06aSMark Adams } 916042217e8SBarry Smith ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data()); 9173fa6b06aSMark Adams } else { 918042217e8SBarry Smith ai = thrust::raw_pointer_cast(matrixA->row_offsets->data()); 9193fa6b06aSMark Adams } 920042217e8SBarry Smith aj = thrust::raw_pointer_cast(matrixA->column_indices->data()); 921042217e8SBarry Smith aa = thrust::raw_pointer_cast(matrixA->values->data()); 922042217e8SBarry Smith 923042217e8SBarry Smith if (!d_mat) { 924042217e8SBarry Smith cudaError_t cerr; 925042217e8SBarry Smith PetscSplitCSRDataStructure h_mat; 926042217e8SBarry Smith 927042217e8SBarry Smith // create and populate strucy on host and copy on device 928042217e8SBarry Smith ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr); 
929042217e8SBarry Smith ierr = PetscNew(&h_mat);CHKERRQ(ierr); 930042217e8SBarry Smith cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr); 931042217e8SBarry Smith if (size > 1) { /* need the colmap array */ 932042217e8SBarry Smith Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 93399551766SMark Adams PetscInt *colmap; 934042217e8SBarry Smith PetscInt ii,n = aij->B->cmap->n,N = A->cmap->N; 935042217e8SBarry Smith 9362c71b3e2SJacob Faibussowitsch PetscCheckFalse(n && !aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 937042217e8SBarry Smith 938042217e8SBarry Smith ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr); 939042217e8SBarry Smith for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1); 940365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES) 941365b711fSMark Adams { // have to make a long version of these 94299551766SMark Adams int *h_bi32, *h_bj32; 94399551766SMark Adams PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64; 944365b711fSMark Adams ierr = PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);CHKERRQ(ierr); 94599551766SMark Adams cerr = cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 94699551766SMark Adams for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i]; 94799551766SMark Adams cerr = cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 94899551766SMark Adams for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i]; 94999551766SMark Adams 95099551766SMark Adams cerr = cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));CHKERRCUDA(cerr); 95199551766SMark Adams cerr = cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 95299551766SMark Adams cerr = cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));CHKERRCUDA(cerr); 95399551766SMark Adams cerr = cudaMemcpy(d_bj64, 
h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 95499551766SMark Adams 95599551766SMark Adams h_mat->offdiag.i = d_bi64; 95699551766SMark Adams h_mat->offdiag.j = d_bj64; 95799551766SMark Adams h_mat->allocated_indices = PETSC_TRUE; 95899551766SMark Adams 959365b711fSMark Adams ierr = PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);CHKERRQ(ierr); 960365b711fSMark Adams } 961365b711fSMark Adams #else 96299551766SMark Adams h_mat->offdiag.i = (PetscInt*)bi; 96399551766SMark Adams h_mat->offdiag.j = (PetscInt*)bj; 96499551766SMark Adams h_mat->allocated_indices = PETSC_FALSE; 965365b711fSMark Adams #endif 966042217e8SBarry Smith h_mat->offdiag.a = ba; 967042217e8SBarry Smith h_mat->offdiag.n = A->rmap->n; 968042217e8SBarry Smith 96999551766SMark Adams cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));CHKERRCUDA(cerr); 97099551766SMark Adams cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 971042217e8SBarry Smith ierr = PetscFree(colmap);CHKERRQ(ierr); 972042217e8SBarry Smith } 973042217e8SBarry Smith h_mat->rstart = A->rmap->rstart; 974042217e8SBarry Smith h_mat->rend = A->rmap->rend; 975042217e8SBarry Smith h_mat->cstart = A->cmap->rstart; 976042217e8SBarry Smith h_mat->cend = A->cmap->rend; 97749b994a9SMark Adams h_mat->M = A->cmap->N; 978365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES) 979365b711fSMark Adams { 9802c71b3e2SJacob Faibussowitsch PetscCheckFalse(sizeof(PetscInt) != 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"size pof PetscInt = %d",sizeof(PetscInt)); 98199551766SMark Adams int *h_ai32, *h_aj32; 98299551766SMark Adams PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64; 983365b711fSMark Adams ierr = PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);CHKERRQ(ierr); 98499551766SMark Adams cerr = cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 98599551766SMark Adams for (int 
i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i]; 98699551766SMark Adams cerr = cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 98799551766SMark Adams for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i]; 98899551766SMark Adams 98999551766SMark Adams cerr = cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));CHKERRCUDA(cerr); 99099551766SMark Adams cerr = cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 99199551766SMark Adams cerr = cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));CHKERRCUDA(cerr); 99299551766SMark Adams cerr = cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 99399551766SMark Adams 99499551766SMark Adams h_mat->diag.i = d_ai64; 99599551766SMark Adams h_mat->diag.j = d_aj64; 99699551766SMark Adams h_mat->allocated_indices = PETSC_TRUE; 99799551766SMark Adams 998365b711fSMark Adams ierr = PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);CHKERRQ(ierr); 999365b711fSMark Adams } 1000365b711fSMark Adams #else 100199551766SMark Adams h_mat->diag.i = (PetscInt*)ai; 100299551766SMark Adams h_mat->diag.j = (PetscInt*)aj; 100399551766SMark Adams h_mat->allocated_indices = PETSC_FALSE; 1004365b711fSMark Adams #endif 1005042217e8SBarry Smith h_mat->diag.a = aa; 1006042217e8SBarry Smith h_mat->diag.n = A->rmap->n; 1007042217e8SBarry Smith h_mat->rank = PetscGlobalRank; 1008042217e8SBarry Smith // copy pointers and metadata to device 1009042217e8SBarry Smith cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 1010042217e8SBarry Smith ierr = PetscFree(h_mat);CHKERRQ(ierr); 1011042217e8SBarry Smith } else { 1012042217e8SBarry Smith ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr); 1013042217e8SBarry Smith } 1014042217e8SBarry Smith *B = d_mat; 1015042217e8SBarry Smith A->offloadmask = PETSC_OFFLOAD_GPU; 10163fa6b06aSMark Adams PetscFunctionReturn(0); 10173fa6b06aSMark Adams } 1018