1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 253800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX 399acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 40fd2b57fSKarl Rupp 53d13b8fdSMatthew G. Knepley #include <petscconf.h> 69ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 757d48284SJunchao Zhang #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 83d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 97e8381f9SStefano Zampini #include <thrust/advance.h> 104eb5378fSStefano Zampini #include <petscsf.h> 117e8381f9SStefano Zampini 127e8381f9SStefano Zampini struct VecCUDAEquals 137e8381f9SStefano Zampini { 147e8381f9SStefano Zampini template <typename Tuple> 157e8381f9SStefano Zampini __host__ __device__ 167e8381f9SStefano Zampini void operator()(Tuple t) 177e8381f9SStefano Zampini { 187e8381f9SStefano Zampini thrust::get<1>(t) = thrust::get<0>(t); 197e8381f9SStefano Zampini } 207e8381f9SStefano Zampini }; 217e8381f9SStefano Zampini 227e8381f9SStefano Zampini static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode) 237e8381f9SStefano Zampini { 247e8381f9SStefano Zampini Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 257e8381f9SStefano Zampini Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr; 267e8381f9SStefano Zampini PetscInt n = cusp->coo_nd + cusp->coo_no; 277e8381f9SStefano Zampini PetscErrorCode ierr; 287e8381f9SStefano Zampini cudaError_t cerr; 297e8381f9SStefano Zampini 307e8381f9SStefano Zampini PetscFunctionBegin; 317e8381f9SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr); 32e61fc153SStefano Zampini if (cusp->coo_p && v) { 3308391a17SStefano Zampini thrust::device_ptr<const PetscScalar> d_v; 3408391a17SStefano Zampini THRUSTARRAY *w = NULL; 3508391a17SStefano Zampini 3608391a17SStefano Zampini if (isCudaMem(v)) { 3708391a17SStefano Zampini d_v = thrust::device_pointer_cast(v); 3808391a17SStefano Zampini } else { 39e61fc153SStefano Zampini w = new THRUSTARRAY(n); 40e61fc153SStefano Zampini w->assign(v,v+n); 4108391a17SStefano Zampini ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr); 4208391a17SStefano Zampini d_v = w->data(); 4308391a17SStefano Zampini } 4408391a17SStefano Zampini 4508391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()), 467e8381f9SStefano Zampini cusp->coo_pw->begin())); 4708391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()), 487e8381f9SStefano Zampini cusp->coo_pw->end())); 4908391a17SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 507e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 5108391a17SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 5208391a17SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 53e61fc153SStefano Zampini delete w; 547e8381f9SStefano Zampini ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr); 557e8381f9SStefano Zampini ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr); 567e8381f9SStefano Zampini } else { 577e8381f9SStefano Zampini ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr); 587e8381f9SStefano Zampini ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr); 597e8381f9SStefano Zampini } 607e8381f9SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr); 617e8381f9SStefano Zampini ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 627e8381f9SStefano Zampini A->num_ass++; 637e8381f9SStefano Zampini A->assembled = PETSC_TRUE; 647e8381f9SStefano Zampini A->ass_nonzerostate = A->nonzerostate; 654eb5378fSStefano Zampini A->offloadmask = PETSC_OFFLOAD_GPU; 667e8381f9SStefano Zampini PetscFunctionReturn(0); 677e8381f9SStefano Zampini } 687e8381f9SStefano Zampini 697e8381f9SStefano Zampini template <typename Tuple> 707e8381f9SStefano Zampini struct IsNotOffDiagT 717e8381f9SStefano Zampini { 727e8381f9SStefano Zampini PetscInt _cstart,_cend; 737e8381f9SStefano Zampini 747e8381f9SStefano Zampini IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {} 757e8381f9SStefano Zampini __host__ __device__ 767e8381f9SStefano Zampini inline bool operator()(Tuple t) 777e8381f9SStefano Zampini { 787e8381f9SStefano Zampini return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); 797e8381f9SStefano Zampini } 807e8381f9SStefano Zampini }; 817e8381f9SStefano Zampini 827e8381f9SStefano Zampini struct IsOffDiag 837e8381f9SStefano Zampini { 847e8381f9SStefano Zampini PetscInt _cstart,_cend; 857e8381f9SStefano Zampini 867e8381f9SStefano Zampini IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {} 877e8381f9SStefano Zampini __host__ __device__ 887e8381f9SStefano Zampini inline bool operator() (const PetscInt &c) 897e8381f9SStefano Zampini { 907e8381f9SStefano Zampini return c < _cstart || c >= _cend; 917e8381f9SStefano Zampini } 927e8381f9SStefano Zampini }; 937e8381f9SStefano Zampini 947e8381f9SStefano Zampini struct GlobToLoc 957e8381f9SStefano Zampini { 967e8381f9SStefano Zampini PetscInt _start; 977e8381f9SStefano Zampini 987e8381f9SStefano Zampini GlobToLoc(PetscInt start) : _start(start) {} 997e8381f9SStefano Zampini __host__ __device__ 1007e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &c) 1017e8381f9SStefano Zampini { 1027e8381f9SStefano Zampini return c - _start; 1037e8381f9SStefano Zampini } 1047e8381f9SStefano Zampini }; 1057e8381f9SStefano Zampini 1067e8381f9SStefano Zampini static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 1077e8381f9SStefano Zampini { 1087e8381f9SStefano Zampini Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 1097e8381f9SStefano Zampini Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr; 1107e8381f9SStefano Zampini PetscErrorCode ierr; 1117e8381f9SStefano Zampini PetscInt *jj; 1127e8381f9SStefano Zampini size_t noff = 0; 1137e8381f9SStefano Zampini THRUSTINTARRAY d_i(n); 1147e8381f9SStefano Zampini THRUSTINTARRAY d_j(n); 1157e8381f9SStefano Zampini ISLocalToGlobalMapping l2g; 1167e8381f9SStefano Zampini cudaError_t cerr; 1177e8381f9SStefano Zampini 1187e8381f9SStefano Zampini PetscFunctionBegin; 1197e8381f9SStefano Zampini ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr); 1207e8381f9SStefano Zampini ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 1217e8381f9SStefano Zampini ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 1227e8381f9SStefano Zampini if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); } 1237e8381f9SStefano Zampini if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); } 1247e8381f9SStefano Zampini ierr = PetscFree(b->garray);CHKERRQ(ierr); 1257e8381f9SStefano Zampini ierr = VecDestroy(&b->lvec);CHKERRQ(ierr); 1267e8381f9SStefano Zampini ierr = MatDestroy(&b->A);CHKERRQ(ierr); 1277e8381f9SStefano Zampini ierr = MatDestroy(&b->B);CHKERRQ(ierr); 1287e8381f9SStefano Zampini 1297e8381f9SStefano Zampini ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr); 1307e8381f9SStefano Zampini d_i.assign(coo_i,coo_i+n); 1317e8381f9SStefano Zampini d_j.assign(coo_j,coo_j+n); 1327e8381f9SStefano Zampini delete cusp->coo_p; 1337e8381f9SStefano Zampini delete cusp->coo_pw; 1347e8381f9SStefano Zampini cusp->coo_p = NULL; 1357e8381f9SStefano Zampini cusp->coo_pw = NULL; 13608391a17SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1377e8381f9SStefano Zampini auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend)); 1387e8381f9SStefano Zampini auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend)); 1397e8381f9SStefano Zampini if (firstoffd != d_j.end() && firstdiag != d_j.end()) { 1407e8381f9SStefano Zampini cusp->coo_p = new THRUSTINTARRAY(n); 1417e8381f9SStefano Zampini cusp->coo_pw = new THRUSTARRAY(n); 1427e8381f9SStefano Zampini thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0); 1437e8381f9SStefano Zampini auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin())); 1447e8381f9SStefano Zampini auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end())); 1457e8381f9SStefano Zampini auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend)); 1467e8381f9SStefano Zampini firstoffd = mzipp.get_iterator_tuple().get<1>(); 1477e8381f9SStefano Zampini } 1487e8381f9SStefano Zampini cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); 1497e8381f9SStefano Zampini cusp->coo_no = thrust::distance(firstoffd,d_j.end()); 1507e8381f9SStefano Zampini 1517e8381f9SStefano Zampini /* from global to local */ 1527e8381f9SStefano Zampini thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart)); 1537e8381f9SStefano Zampini thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart)); 15408391a17SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 15508391a17SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1567e8381f9SStefano Zampini 1577e8381f9SStefano Zampini /* copy offdiag column indices to map on the CPU */ 1587e8381f9SStefano Zampini ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); 1597e8381f9SStefano Zampini cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1607e8381f9SStefano Zampini auto o_j = d_j.begin(); 16108391a17SStefano Zampini ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 1627e8381f9SStefano Zampini thrust::advance(o_j,cusp->coo_nd); 1637e8381f9SStefano Zampini thrust::sort(thrust::device,o_j,d_j.end()); 1647e8381f9SStefano Zampini auto wit = thrust::unique(thrust::device,o_j,d_j.end()); 16508391a17SStefano Zampini cerr = WaitForCUDA();CHKERRCUDA(cerr); 16608391a17SStefano Zampini ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1677e8381f9SStefano Zampini noff = thrust::distance(o_j,wit); 1687e8381f9SStefano Zampini ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr); 1697e8381f9SStefano Zampini cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 1707e8381f9SStefano Zampini ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr); 1717e8381f9SStefano Zampini ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr); 1727e8381f9SStefano Zampini ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr); 1737e8381f9SStefano Zampini ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr); 1747e8381f9SStefano Zampini if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no); 1757e8381f9SStefano Zampini ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr); 1767e8381f9SStefano Zampini 1777e8381f9SStefano Zampini ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 1787e8381f9SStefano Zampini ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 1797e8381f9SStefano Zampini ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1807e8381f9SStefano Zampini ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 1817e8381f9SStefano Zampini ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 1827e8381f9SStefano Zampini ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr); 1837e8381f9SStefano Zampini ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 1847e8381f9SStefano Zampini ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 1857e8381f9SStefano Zampini 1867e8381f9SStefano Zampini /* GPU memory, cusparse specific call handles it internally */ 1877e8381f9SStefano Zampini ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr); 1887e8381f9SStefano Zampini ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr); 1897e8381f9SStefano Zampini ierr = PetscFree(jj);CHKERRQ(ierr); 1907e8381f9SStefano Zampini 1917e8381f9SStefano Zampini ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr); 1927e8381f9SStefano Zampini ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr); 1937e8381f9SStefano Zampini ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr); 1947e8381f9SStefano Zampini ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr); 1957e8381f9SStefano Zampini ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr); 1967e8381f9SStefano Zampini ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr); 1977e8381f9SStefano Zampini ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr); 1987e8381f9SStefano Zampini B->preallocated = PETSC_TRUE; 1997e8381f9SStefano Zampini B->nonzerostate++; 2007e8381f9SStefano Zampini ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr); 2017e8381f9SStefano Zampini 2027e8381f9SStefano Zampini ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr); 2037e8381f9SStefano Zampini ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr); 2047e8381f9SStefano Zampini B->offloadmask = PETSC_OFFLOAD_CPU; 2057e8381f9SStefano Zampini B->assembled = PETSC_FALSE; 2067e8381f9SStefano Zampini B->was_assembled = PETSC_FALSE; 2077e8381f9SStefano Zampini PetscFunctionReturn(0); 2087e8381f9SStefano Zampini } 2099ae82921SPaul Mullowney 210ed502f03SStefano Zampini static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc) 211ed502f03SStefano Zampini { 212ed502f03SStefano Zampini Mat Ad,Ao; 213ed502f03SStefano Zampini const PetscInt *cmap; 214ed502f03SStefano Zampini PetscErrorCode ierr; 215ed502f03SStefano Zampini 216ed502f03SStefano Zampini PetscFunctionBegin; 217ed502f03SStefano Zampini ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr); 218ed502f03SStefano Zampini ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr); 219ed502f03SStefano Zampini if (glob) { 220ed502f03SStefano Zampini PetscInt cst, i, dn, on, *gidx; 221ed502f03SStefano Zampini 222ed502f03SStefano Zampini ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr); 223ed502f03SStefano Zampini ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr); 224ed502f03SStefano Zampini ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr); 225ed502f03SStefano Zampini ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr); 226ed502f03SStefano Zampini for (i=0; i<dn; i++) gidx[i] = cst + i; 227ed502f03SStefano Zampini for (i=0; i<on; i++) gidx[i+dn] = cmap[i]; 228ed502f03SStefano Zampini ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr); 229ed502f03SStefano Zampini } 230ed502f03SStefano Zampini PetscFunctionReturn(0); 231ed502f03SStefano Zampini } 232ed502f03SStefano Zampini 2339ae82921SPaul Mullowney PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 2349ae82921SPaul Mullowney { 235bbf3fe20SPaul Mullowney Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 236bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 2379ae82921SPaul Mullowney PetscErrorCode ierr; 2389ae82921SPaul Mullowney PetscInt i; 2399ae82921SPaul Mullowney 2409ae82921SPaul Mullowney PetscFunctionBegin; 2419ae82921SPaul Mullowney ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 2429ae82921SPaul Mullowney ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 2437e8381f9SStefano Zampini if (PetscDefined(USE_DEBUG) && d_nnz) { 2449ae82921SPaul Mullowney for (i=0; i<B->rmap->n; i++) { 2459ae82921SPaul Mullowney if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 2469ae82921SPaul Mullowney } 2479ae82921SPaul Mullowney } 2487e8381f9SStefano Zampini if (PetscDefined(USE_DEBUG) && o_nnz) { 2499ae82921SPaul Mullowney for (i=0; i<B->rmap->n; i++) { 2509ae82921SPaul Mullowney if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 2519ae82921SPaul Mullowney } 2529ae82921SPaul Mullowney } 2539ae82921SPaul Mullowney if (!B->preallocated) { 254bbf3fe20SPaul Mullowney /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 2559ae82921SPaul Mullowney ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 256b470e4b4SRichard Tran Mills ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr); 2579ae82921SPaul Mullowney ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 2583bb1ff40SBarry Smith ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 2599ae82921SPaul Mullowney ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 260b470e4b4SRichard Tran Mills ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr); 2619ae82921SPaul Mullowney ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 2623bb1ff40SBarry Smith ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 2639ae82921SPaul Mullowney } 264d98d7c49SStefano Zampini ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 265d98d7c49SStefano Zampini ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 266d98d7c49SStefano Zampini if (b->lvec) { 267d98d7c49SStefano Zampini ierr = VecSetType(b->lvec,VECSEQCUDA);CHKERRQ(ierr); 268d98d7c49SStefano Zampini } 2699ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 2709ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 271e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 272e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 273b06137fdSPaul Mullowney ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 274b06137fdSPaul Mullowney ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 275b06137fdSPaul Mullowney ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 276b06137fdSPaul Mullowney ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 2772205254eSKarl Rupp 2789ae82921SPaul Mullowney B->preallocated = PETSC_TRUE; 2799ae82921SPaul Mullowney PetscFunctionReturn(0); 2809ae82921SPaul Mullowney } 281e057df02SPaul Mullowney 2824eb5378fSStefano Zampini typedef struct { 2834eb5378fSStefano Zampini Mat *mp; /* intermediate products */ 2844eb5378fSStefano Zampini PetscBool *mptmp; /* is the intermediate product temporary ? */ 2854eb5378fSStefano Zampini PetscInt cp; /* number of intermediate products */ 2864eb5378fSStefano Zampini 2874eb5378fSStefano Zampini /* support for MatGetBrowsOfAoCols_MPIAIJ for P_oth */ 2884eb5378fSStefano Zampini PetscInt *startsj_s,*startsj_r; 2894eb5378fSStefano Zampini PetscScalar *bufa; 2904eb5378fSStefano Zampini Mat P_oth; 2914eb5378fSStefano Zampini 2924eb5378fSStefano Zampini /* may take advantage of merging product->B */ 2934eb5378fSStefano Zampini Mat Bloc; 2944eb5378fSStefano Zampini 2954eb5378fSStefano Zampini /* cusparse does not have support to split between symbolic and numeric phases 2964eb5378fSStefano Zampini When api_user is true, we don't need to update the numerical values 2974eb5378fSStefano Zampini of the temporary storage */ 2984eb5378fSStefano Zampini PetscBool reusesym; 2994eb5378fSStefano Zampini 3004eb5378fSStefano Zampini /* support for COO values insertion */ 3014eb5378fSStefano Zampini PetscScalar *coo_v,*coo_w; 3024eb5378fSStefano Zampini PetscSF sf; /* if present, non-local values insertion (i.e. AtB or PtAP) */ 3034eb5378fSStefano Zampini } MatMatMPIAIJCUSPARSE; 3044eb5378fSStefano Zampini 3054eb5378fSStefano Zampini PetscErrorCode MatDestroy_MatMatMPIAIJCUSPARSE(void *data) 3064eb5378fSStefano Zampini { 3074eb5378fSStefano Zampini MatMatMPIAIJCUSPARSE *mmdata = (MatMatMPIAIJCUSPARSE*)data; 3084eb5378fSStefano Zampini PetscInt i; 3094eb5378fSStefano Zampini PetscErrorCode ierr; 3104eb5378fSStefano Zampini 3114eb5378fSStefano Zampini PetscFunctionBegin; 3124eb5378fSStefano Zampini ierr = PetscFree2(mmdata->startsj_s,mmdata->startsj_r);CHKERRQ(ierr); 3134eb5378fSStefano Zampini ierr = PetscFree(mmdata->bufa);CHKERRQ(ierr); 3144eb5378fSStefano Zampini ierr = PetscFree(mmdata->coo_v);CHKERRQ(ierr); 3154eb5378fSStefano Zampini ierr = PetscFree(mmdata->coo_w);CHKERRQ(ierr); 3164eb5378fSStefano Zampini ierr = MatDestroy(&mmdata->P_oth);CHKERRQ(ierr); 3174eb5378fSStefano Zampini ierr = MatDestroy(&mmdata->Bloc);CHKERRQ(ierr); 3184eb5378fSStefano Zampini ierr = PetscSFDestroy(&mmdata->sf);CHKERRQ(ierr); 3194eb5378fSStefano Zampini for (i = 0; i < mmdata->cp; i++) { 3204eb5378fSStefano Zampini ierr = MatDestroy(&mmdata->mp[i]);CHKERRQ(ierr); 3214eb5378fSStefano Zampini } 3224eb5378fSStefano Zampini ierr = PetscFree(mmdata->mp);CHKERRQ(ierr); 3234eb5378fSStefano Zampini ierr = PetscFree(mmdata->mptmp);CHKERRQ(ierr); 3244eb5378fSStefano Zampini ierr = PetscFree(mmdata);CHKERRQ(ierr); 3254eb5378fSStefano Zampini PetscFunctionReturn(0); 3264eb5378fSStefano Zampini } 3274eb5378fSStefano Zampini 3284eb5378fSStefano Zampini static PetscErrorCode MatProductNumeric_MPIAIJCUSPARSE_MPIAIJCUSPARSE(Mat C) 3294eb5378fSStefano Zampini { 3304eb5378fSStefano Zampini MatMatMPIAIJCUSPARSE *mmdata; 3314eb5378fSStefano Zampini PetscScalar *tmp; 3324eb5378fSStefano Zampini PetscInt i,n; 3334eb5378fSStefano Zampini PetscErrorCode ierr; 3344eb5378fSStefano Zampini 3354eb5378fSStefano Zampini PetscFunctionBegin; 3364eb5378fSStefano Zampini MatCheckProduct(C,1); 3374eb5378fSStefano Zampini if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 3384eb5378fSStefano Zampini mmdata = (MatMatMPIAIJCUSPARSE*)C->product->data; 3394eb5378fSStefano Zampini tmp = mmdata->sf ? mmdata->coo_w : mmdata->coo_v; 3404eb5378fSStefano Zampini if (!mmdata->reusesym) { /* update temporary matrices */ 3414eb5378fSStefano Zampini if (mmdata->P_oth) { 3424eb5378fSStefano Zampini ierr = MatGetBrowsOfAoCols_MPIAIJ(C->product->A,C->product->B,MAT_REUSE_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr); 3434eb5378fSStefano Zampini } 3444eb5378fSStefano Zampini if (mmdata->Bloc) { 3454eb5378fSStefano Zampini ierr = MatMPIAIJGetLocalMatMerge(C->product->B,MAT_REUSE_MATRIX,NULL,&mmdata->Bloc);CHKERRQ(ierr); 3464eb5378fSStefano Zampini } 3474eb5378fSStefano Zampini } 3484eb5378fSStefano Zampini mmdata->reusesym = PETSC_FALSE; 3494eb5378fSStefano Zampini for (i = 0, n = 0; i < mmdata->cp; i++) { 3504eb5378fSStefano Zampini Mat_SeqAIJ *mm = (Mat_SeqAIJ*)mmdata->mp[i]->data; 3514eb5378fSStefano Zampini const PetscScalar *vv; 3524eb5378fSStefano Zampini 3534eb5378fSStefano Zampini if (!mmdata->mp[i]->ops->productnumeric) SETERRQ1(PetscObjectComm((PetscObject)mmdata->mp[i]),PETSC_ERR_PLIB,"Missing numeric op for %s",MatProductTypes[mmdata->mp[i]->product->type]); 3544eb5378fSStefano Zampini ierr = (*mmdata->mp[i]->ops->productnumeric)(mmdata->mp[i]);CHKERRQ(ierr); 3554eb5378fSStefano Zampini if (mmdata->mptmp[i]) continue; 3564eb5378fSStefano Zampini /* TODO: add support for using GPU data directly */ 3574eb5378fSStefano Zampini ierr = MatSeqAIJGetArrayRead(mmdata->mp[i],&vv);CHKERRQ(ierr); 3584eb5378fSStefano Zampini ierr = PetscArraycpy(tmp + n,vv,mm->nz);CHKERRQ(ierr); 3594eb5378fSStefano Zampini ierr = MatSeqAIJRestoreArrayRead(mmdata->mp[i],&vv);CHKERRQ(ierr); 3604eb5378fSStefano Zampini n += mm->nz; 3614eb5378fSStefano Zampini } 3624eb5378fSStefano Zampini if (mmdata->sf) { /* offprocess insertion */ 3634eb5378fSStefano Zampini ierr = PetscSFGatherBegin(mmdata->sf,MPIU_SCALAR,tmp,mmdata->coo_v);CHKERRQ(ierr); 3644eb5378fSStefano Zampini ierr = PetscSFGatherEnd(mmdata->sf,MPIU_SCALAR,tmp,mmdata->coo_v);CHKERRQ(ierr); 3654eb5378fSStefano Zampini } 3664eb5378fSStefano Zampini ierr = MatSetValuesCOO(C,mmdata->coo_v,INSERT_VALUES);CHKERRQ(ierr); 3674eb5378fSStefano Zampini PetscFunctionReturn(0); 3684eb5378fSStefano Zampini } 3694eb5378fSStefano Zampini 3704eb5378fSStefano Zampini /* Pt * A or A * P */ 3714eb5378fSStefano Zampini #define MAX_NUMBER_INTERMEDIATE 4 3724eb5378fSStefano Zampini static PetscErrorCode MatProductSymbolic_MPIAIJCUSPARSE_MPIAIJCUSPARSE(Mat C) 3734eb5378fSStefano Zampini { 3744eb5378fSStefano Zampini Mat_Product *product = C->product; 3754eb5378fSStefano Zampini Mat A,P,mp[MAX_NUMBER_INTERMEDIATE]; 3764eb5378fSStefano Zampini Mat_MPIAIJ *a,*p; 3774eb5378fSStefano Zampini MatMatMPIAIJCUSPARSE *mmdata; 3784eb5378fSStefano Zampini ISLocalToGlobalMapping P_oth_l2g = NULL; 3794eb5378fSStefano Zampini IS glob = NULL; 3804eb5378fSStefano Zampini const PetscInt *globidx,*P_oth_idx; 3814eb5378fSStefano Zampini const PetscInt *cmapa[MAX_NUMBER_INTERMEDIATE],*rmapa[MAX_NUMBER_INTERMEDIATE]; 3824eb5378fSStefano Zampini PetscInt cp = 0,m,n,M,N,ncoo,*coo_i,*coo_j,cmapt[MAX_NUMBER_INTERMEDIATE],rmapt[MAX_NUMBER_INTERMEDIATE],i,j; 3834eb5378fSStefano Zampini MatProductType ptype; 3844eb5378fSStefano Zampini PetscBool mptmp[MAX_NUMBER_INTERMEDIATE],hasoffproc = PETSC_FALSE; 3854eb5378fSStefano Zampini PetscMPIInt size; 3864eb5378fSStefano Zampini PetscErrorCode ierr; 3874eb5378fSStefano Zampini 3884eb5378fSStefano Zampini PetscFunctionBegin; 3894eb5378fSStefano Zampini MatCheckProduct(C,1); 3904eb5378fSStefano Zampini if (product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 3914eb5378fSStefano Zampini ptype = product->type; 3924eb5378fSStefano Zampini if (product->A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB; 3934eb5378fSStefano Zampini switch (ptype) { 3944eb5378fSStefano Zampini case MATPRODUCT_AB: 3954eb5378fSStefano Zampini A = product->A; 3964eb5378fSStefano Zampini P = product->B; 3974eb5378fSStefano Zampini m = A->rmap->n; 3984eb5378fSStefano Zampini n = P->cmap->n; 3994eb5378fSStefano Zampini M = A->rmap->N; 4004eb5378fSStefano Zampini N = P->cmap->N; 4014eb5378fSStefano Zampini break; 4024eb5378fSStefano Zampini case MATPRODUCT_AtB: 4034eb5378fSStefano Zampini P = product->A; 4044eb5378fSStefano Zampini A = product->B; 4054eb5378fSStefano Zampini m = P->cmap->n; 4064eb5378fSStefano Zampini n = A->cmap->n; 4074eb5378fSStefano Zampini M = P->cmap->N; 4084eb5378fSStefano Zampini N = A->cmap->N; 4094eb5378fSStefano Zampini hasoffproc = PETSC_TRUE; 4104eb5378fSStefano Zampini break; 4114eb5378fSStefano Zampini case MATPRODUCT_PtAP: 4124eb5378fSStefano Zampini A = product->A; 4134eb5378fSStefano Zampini P = product->B; 4144eb5378fSStefano Zampini m = P->cmap->n; 4154eb5378fSStefano Zampini n = P->cmap->n; 4164eb5378fSStefano Zampini M = P->cmap->N; 4174eb5378fSStefano Zampini N = P->cmap->N; 4184eb5378fSStefano Zampini hasoffproc = PETSC_TRUE; 4194eb5378fSStefano Zampini break; 4204eb5378fSStefano Zampini default: 4214eb5378fSStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]); 4224eb5378fSStefano Zampini } 4234eb5378fSStefano Zampini ierr = MPI_Comm_size(PetscObjectComm((PetscObject)C),&size);CHKERRQ(ierr); 4244eb5378fSStefano Zampini if (size == 1) hasoffproc = PETSC_FALSE; 4254eb5378fSStefano Zampini 4264eb5378fSStefano Zampini for (i=0;i<MAX_NUMBER_INTERMEDIATE;i++) { 4274eb5378fSStefano Zampini mp[i] = NULL; 4284eb5378fSStefano Zampini mptmp[i] = PETSC_FALSE; 4294eb5378fSStefano Zampini rmapt[i] = 0; 4304eb5378fSStefano Zampini cmapt[i] = 0; 4314eb5378fSStefano Zampini rmapa[i] = NULL; 4324eb5378fSStefano Zampini cmapa[i] = NULL; 4334eb5378fSStefano Zampini } 4344eb5378fSStefano Zampini 4354eb5378fSStefano Zampini a = (Mat_MPIAIJ*)A->data; 4364eb5378fSStefano Zampini p = (Mat_MPIAIJ*)P->data; 4374eb5378fSStefano Zampini ierr = MatSetSizes(C,m,n,M,N);CHKERRQ(ierr); 4384eb5378fSStefano Zampini ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr); 4394eb5378fSStefano Zampini ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr); 4404eb5378fSStefano Zampini ierr = MatSetType(C,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 4414eb5378fSStefano Zampini ierr = PetscNew(&mmdata);CHKERRQ(ierr); 4424eb5378fSStefano Zampini mmdata->reusesym = product->api_user; 4434eb5378fSStefano Zampini switch (ptype) { 4444eb5378fSStefano Zampini case MATPRODUCT_AB: /* A * P */ 4454eb5378fSStefano Zampini ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr); 4464eb5378fSStefano Zampini 4474eb5378fSStefano Zampini if (1) { /* A_diag * P_loc and A_off * P_oth TODO: add customization for this */ 4484eb5378fSStefano Zampini /* P is product->B */ 4494eb5378fSStefano Zampini ierr = MatMPIAIJGetLocalMatMerge(P,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr); 4504eb5378fSStefano Zampini ierr = MatProductCreate(a->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr); 4514eb5378fSStefano Zampini ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr); 4524eb5378fSStefano Zampini ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr); 4534eb5378fSStefano Zampini mp[cp]->product->api_user = product->api_user; 4544eb5378fSStefano Zampini ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr); 4554eb5378fSStefano Zampini if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]); 4564eb5378fSStefano Zampini ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr); 4574eb5378fSStefano Zampini ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr); 4584eb5378fSStefano Zampini rmapt[cp] = 1; 4594eb5378fSStefano Zampini cmapt[cp] = 2; 4604eb5378fSStefano Zampini cmapa[cp] = globidx; 4614eb5378fSStefano Zampini mptmp[cp] = PETSC_FALSE; 4624eb5378fSStefano Zampini cp++; 4634eb5378fSStefano Zampini } else { /* A_diag * P_diag and A_diag * P_off and A_off * P_oth */ 4644eb5378fSStefano Zampini ierr = MatProductCreate(a->A,p->A,NULL,&mp[cp]);CHKERRQ(ierr); 4654eb5378fSStefano Zampini ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr); 4664eb5378fSStefano Zampini ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr); 4674eb5378fSStefano Zampini mp[cp]->product->api_user = product->api_user; 4684eb5378fSStefano Zampini ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr); 4694eb5378fSStefano Zampini if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]); 4704eb5378fSStefano Zampini ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr); 4714eb5378fSStefano Zampini rmapt[cp] = 1; 4724eb5378fSStefano Zampini cmapt[cp] = 1; 4734eb5378fSStefano Zampini mptmp[cp] = PETSC_FALSE; 4744eb5378fSStefano Zampini cp++; 4754eb5378fSStefano Zampini ierr = MatProductCreate(a->A,p->B,NULL,&mp[cp]);CHKERRQ(ierr); 4764eb5378fSStefano Zampini ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr); 4774eb5378fSStefano Zampini ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr); 4784eb5378fSStefano Zampini mp[cp]->product->api_user = product->api_user; 4794eb5378fSStefano Zampini ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr); 4804eb5378fSStefano Zampini if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]); 4814eb5378fSStefano Zampini ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr); 4824eb5378fSStefano Zampini rmapt[cp] = 1; 4834eb5378fSStefano Zampini cmapt[cp] = 2; 4844eb5378fSStefano Zampini cmapa[cp] = p->garray; 4854eb5378fSStefano Zampini mptmp[cp] = PETSC_FALSE; 4864eb5378fSStefano Zampini cp++; 4874eb5378fSStefano Zampini } 4884eb5378fSStefano Zampini if (mmdata->P_oth) { 4894eb5378fSStefano Zampini ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth,&P_oth_l2g);CHKERRQ(ierr); 4904eb5378fSStefano Zampini ierr = ISLocalToGlobalMappingGetIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr); 4914eb5378fSStefano Zampini ierr = MatSetType(mmdata->P_oth,((PetscObject)(a->B))->type_name);CHKERRQ(ierr); 4924eb5378fSStefano Zampini ierr = MatProductCreate(a->B,mmdata->P_oth,NULL,&mp[cp]);CHKERRQ(ierr); 4934eb5378fSStefano Zampini ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr); 4944eb5378fSStefano Zampini ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr); 4954eb5378fSStefano Zampini mp[cp]->product->api_user = product->api_user; 4964eb5378fSStefano Zampini ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr); 4974eb5378fSStefano Zampini if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]); 4984eb5378fSStefano Zampini ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr); 4994eb5378fSStefano Zampini rmapt[cp] = 1; 5004eb5378fSStefano Zampini cmapt[cp] = 2; 5014eb5378fSStefano Zampini cmapa[cp] = P_oth_idx; 5024eb5378fSStefano Zampini mptmp[cp] = PETSC_FALSE; 5034eb5378fSStefano Zampini cp++; 5044eb5378fSStefano Zampini } 5054eb5378fSStefano Zampini break; 5064eb5378fSStefano Zampini case MATPRODUCT_AtB: /* (P^t * A): P_diag * A_loc + P_off * A_loc */ 5074eb5378fSStefano Zampini /* A is product->B */ 5084eb5378fSStefano Zampini ierr = MatMPIAIJGetLocalMatMerge(A,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr); 5094eb5378fSStefano Zampini ierr = MatProductCreate(p->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr); 5104eb5378fSStefano Zampini ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr); 5114eb5378fSStefano Zampini ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr); 5124eb5378fSStefano Zampini mp[cp]->product->api_user = product->api_user; 5134eb5378fSStefano Zampini ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr); 5144eb5378fSStefano Zampini if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]); 5154eb5378fSStefano Zampini ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr); 5164eb5378fSStefano Zampini ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr); 5174eb5378fSStefano Zampini rmapt[cp] = 1; 5184eb5378fSStefano Zampini cmapt[cp] = 2; 5194eb5378fSStefano Zampini cmapa[cp] = globidx; 5204eb5378fSStefano Zampini mptmp[cp] = PETSC_FALSE; 5214eb5378fSStefano Zampini cp++; 5224eb5378fSStefano Zampini ierr = MatProductCreate(p->B,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr); 5234eb5378fSStefano Zampini ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr); 5244eb5378fSStefano Zampini ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr); 5254eb5378fSStefano Zampini mp[cp]->product->api_user = product->api_user; 5264eb5378fSStefano Zampini ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr); 5274eb5378fSStefano Zampini if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]); 5284eb5378fSStefano Zampini ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr); 5294eb5378fSStefano Zampini rmapt[cp] = 2; 5304eb5378fSStefano Zampini rmapa[cp] = p->garray; 5314eb5378fSStefano Zampini cmapt[cp] = 2; 5324eb5378fSStefano Zampini cmapa[cp] = globidx; 5334eb5378fSStefano Zampini mptmp[cp] = PETSC_FALSE; 5344eb5378fSStefano Zampini cp++; 5354eb5378fSStefano Zampini break; 5364eb5378fSStefano Zampini case MATPRODUCT_PtAP: 5374eb5378fSStefano Zampini ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr); 5384eb5378fSStefano Zampini /* P is product->B */ 5394eb5378fSStefano Zampini ierr = MatMPIAIJGetLocalMatMerge(P,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr); 5404eb5378fSStefano Zampini ierr = MatProductCreate(a->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr); 5414eb5378fSStefano Zampini ierr = MatProductSetType(mp[cp],MATPRODUCT_PtAP);CHKERRQ(ierr); 5424eb5378fSStefano Zampini ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr); 5434eb5378fSStefano Zampini mp[cp]->product->api_user = product->api_user; 5444eb5378fSStefano Zampini ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr); 5454eb5378fSStefano Zampini if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]); 5464eb5378fSStefano Zampini ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr); 5474eb5378fSStefano Zampini ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr); 5484eb5378fSStefano Zampini rmapt[cp] = 2; 5494eb5378fSStefano Zampini rmapa[cp] = globidx; 5504eb5378fSStefano Zampini cmapt[cp] = 2; 5514eb5378fSStefano Zampini cmapa[cp] = globidx; 5524eb5378fSStefano Zampini mptmp[cp] = PETSC_FALSE; 5534eb5378fSStefano Zampini cp++; 5544eb5378fSStefano Zampini if (mmdata->P_oth) { 5554eb5378fSStefano Zampini ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth,&P_oth_l2g);CHKERRQ(ierr); 5564eb5378fSStefano Zampini ierr = MatSetType(mmdata->P_oth,((PetscObject)(a->B))->type_name);CHKERRQ(ierr); 5574eb5378fSStefano Zampini ierr = ISLocalToGlobalMappingGetIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr); 5584eb5378fSStefano Zampini ierr = MatProductCreate(a->B,mmdata->P_oth,NULL,&mp[cp]);CHKERRQ(ierr); 5594eb5378fSStefano Zampini ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr); 5604eb5378fSStefano Zampini ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr); 5614eb5378fSStefano Zampini mp[cp]->product->api_user = product->api_user; 5624eb5378fSStefano Zampini ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr); 5634eb5378fSStefano Zampini if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]); 5644eb5378fSStefano Zampini ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr); 5654eb5378fSStefano Zampini mptmp[cp] = PETSC_TRUE; 5664eb5378fSStefano Zampini cp++; 5674eb5378fSStefano Zampini ierr = MatProductCreate(mmdata->Bloc,mp[1],NULL,&mp[cp]);CHKERRQ(ierr); 5684eb5378fSStefano Zampini ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr); 5694eb5378fSStefano Zampini ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr); 5704eb5378fSStefano Zampini mp[cp]->product->api_user = product->api_user; 5714eb5378fSStefano Zampini ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr); 5724eb5378fSStefano Zampini if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]); 5734eb5378fSStefano Zampini ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr); 5744eb5378fSStefano Zampini rmapt[cp] = 2; 5754eb5378fSStefano Zampini rmapa[cp] = globidx; 5764eb5378fSStefano Zampini cmapt[cp] = 2; 5774eb5378fSStefano Zampini cmapa[cp] = P_oth_idx; 5784eb5378fSStefano Zampini mptmp[cp] = PETSC_FALSE; 5794eb5378fSStefano Zampini cp++; 5804eb5378fSStefano Zampini } 5814eb5378fSStefano Zampini break; 5824eb5378fSStefano Zampini default: 5834eb5378fSStefano Zampini SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]); 5844eb5378fSStefano Zampini } 5854eb5378fSStefano Zampini ierr = PetscMalloc1(cp,&mmdata->mp);CHKERRQ(ierr); 5864eb5378fSStefano Zampini for (i = 0; i < cp; i++) mmdata->mp[i] = mp[i]; 5874eb5378fSStefano Zampini ierr = PetscMalloc1(cp,&mmdata->mptmp);CHKERRQ(ierr); 5884eb5378fSStefano Zampini for (i = 0; i < cp; i++) mmdata->mptmp[i] = mptmp[i]; 5894eb5378fSStefano Zampini mmdata->cp = cp; 5904eb5378fSStefano Zampini C->product->data = mmdata; 5914eb5378fSStefano Zampini C->product->destroy = MatDestroy_MatMatMPIAIJCUSPARSE; 5924eb5378fSStefano Zampini C->ops->productnumeric = MatProductNumeric_MPIAIJCUSPARSE_MPIAIJCUSPARSE; 5934eb5378fSStefano Zampini 5944eb5378fSStefano Zampini /* prepare coo coordinates for values insertion */ 5954eb5378fSStefano Zampini ncoo = 0; 5964eb5378fSStefano Zampini for (cp = 0; cp < mmdata->cp; cp++) { 5974eb5378fSStefano Zampini Mat_SeqAIJ *mm = (Mat_SeqAIJ*)mp[cp]->data; 5984eb5378fSStefano Zampini if (mptmp[cp]) continue; 5994eb5378fSStefano Zampini ncoo += mm->nz; 6004eb5378fSStefano Zampini } 6014eb5378fSStefano Zampini ierr = PetscMalloc2(ncoo,&coo_i,ncoo,&coo_j);CHKERRQ(ierr); 6024eb5378fSStefano Zampini ncoo = 0; 6034eb5378fSStefano Zampini for (cp = 0; cp < mmdata->cp; cp++) { 6044eb5378fSStefano Zampini Mat_SeqAIJ *mm = (Mat_SeqAIJ*)mp[cp]->data; 6054eb5378fSStefano Zampini PetscInt *coi = coo_i + ncoo; 6064eb5378fSStefano Zampini PetscInt *coj = coo_j + ncoo; 6074eb5378fSStefano Zampini const PetscInt mr = mp[cp]->rmap->n; 6084eb5378fSStefano Zampini const PetscInt *jj = mm->j; 6094eb5378fSStefano Zampini const PetscInt *ii = mm->i; 6104eb5378fSStefano Zampini 6114eb5378fSStefano Zampini if (mptmp[cp]) continue; 6124eb5378fSStefano Zampini /* rows coo */ 6134eb5378fSStefano Zampini if (!rmapt[cp]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"This should not happen"); 6144eb5378fSStefano Zampini else if (rmapt[cp] == 1) { /* local to global for owned rows of */ 6154eb5378fSStefano Zampini const PetscInt rs = C->rmap->rstart; 6164eb5378fSStefano Zampini for (i = 0; i < mr; i++) { 6174eb5378fSStefano Zampini const PetscInt gr = i + rs; 6184eb5378fSStefano Zampini for (j = ii[i]; j < ii[i+1]; j++) { 6194eb5378fSStefano Zampini coi[j] = gr; 6204eb5378fSStefano Zampini } 6214eb5378fSStefano Zampini } 6224eb5378fSStefano Zampini } else { /* offprocess */ 6234eb5378fSStefano Zampini const PetscInt *rmap = rmapa[cp]; 6244eb5378fSStefano Zampini for (i = 0; i < mr; i++) { 6254eb5378fSStefano Zampini const PetscInt gr = rmap[i]; 6264eb5378fSStefano Zampini for (j = ii[i]; j < ii[i+1]; j++) { 6274eb5378fSStefano Zampini coi[j] = gr; 6284eb5378fSStefano Zampini } 6294eb5378fSStefano Zampini } 6304eb5378fSStefano Zampini } 6314eb5378fSStefano Zampini /* columns coo */ 6324eb5378fSStefano Zampini if (!cmapt[cp]) { 6334eb5378fSStefano Zampini ierr = PetscArraycpy(coj,jj,mm->nz);CHKERRQ(ierr); 6344eb5378fSStefano Zampini } else if (cmapt[cp] == 1) { /* local to global for owned columns of P */ 6354eb5378fSStefano Zampini const PetscInt cs = P->cmap->rstart; 6364eb5378fSStefano Zampini for (j = 0; j < mm->nz; j++) { 6374eb5378fSStefano Zampini coj[j] = jj[j] + cs; 6384eb5378fSStefano Zampini } 6394eb5378fSStefano Zampini } else { /* offdiag */ 6404eb5378fSStefano Zampini const PetscInt *cmap = cmapa[cp]; 6414eb5378fSStefano Zampini for (j = 0; j < mm->nz; j++) { 6424eb5378fSStefano Zampini coj[j] = cmap[jj[j]]; 6434eb5378fSStefano Zampini } 6444eb5378fSStefano Zampini } 6454eb5378fSStefano Zampini ncoo += mm->nz; 6464eb5378fSStefano Zampini } 6474eb5378fSStefano Zampini if (glob) { 6484eb5378fSStefano Zampini ierr = ISRestoreIndices(glob,&globidx);CHKERRQ(ierr); 6494eb5378fSStefano Zampini } 6504eb5378fSStefano Zampini ierr = ISDestroy(&glob);CHKERRQ(ierr); 6514eb5378fSStefano Zampini if (P_oth_l2g) { 6524eb5378fSStefano Zampini ierr = ISLocalToGlobalMappingRestoreIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr); 6534eb5378fSStefano Zampini } 6544eb5378fSStefano Zampini ierr = ISLocalToGlobalMappingDestroy(&P_oth_l2g);CHKERRQ(ierr); 6554eb5378fSStefano Zampini 6564eb5378fSStefano Zampini if (hasoffproc) { /* offproc values insertion */ 6574eb5378fSStefano Zampini const PetscInt *sfdeg; 6584eb5378fSStefano Zampini const PetscInt n = P->cmap->n; 6594eb5378fSStefano Zampini PetscInt ncoo2,*coo_i2,*coo_j2; 6604eb5378fSStefano Zampini 6614eb5378fSStefano Zampini ierr = PetscSFCreate(PetscObjectComm((PetscObject)C),&mmdata->sf);CHKERRQ(ierr); 6624eb5378fSStefano Zampini ierr = PetscSFSetGraphLayout(mmdata->sf,P->cmap,ncoo,NULL,PETSC_OWN_POINTER,coo_i);CHKERRQ(ierr); 6634eb5378fSStefano Zampini ierr = PetscSFComputeDegreeBegin(mmdata->sf,&sfdeg);CHKERRQ(ierr); 6644eb5378fSStefano Zampini ierr = PetscSFComputeDegreeEnd(mmdata->sf,&sfdeg);CHKERRQ(ierr); 6654eb5378fSStefano Zampini for (i = 0, ncoo2 = 0; i < n; i++) ncoo2 += sfdeg[i]; 6664eb5378fSStefano Zampini ierr = PetscMalloc2(ncoo2,&coo_i2,ncoo2,&coo_j2);CHKERRQ(ierr); 6674eb5378fSStefano Zampini ierr = PetscSFGatherBegin(mmdata->sf,MPIU_INT,coo_i,coo_i2);CHKERRQ(ierr); 6684eb5378fSStefano Zampini ierr = PetscSFGatherEnd(mmdata->sf,MPIU_INT,coo_i,coo_i2);CHKERRQ(ierr); 6694eb5378fSStefano Zampini ierr = PetscSFGatherBegin(mmdata->sf,MPIU_INT,coo_j,coo_j2);CHKERRQ(ierr); 6704eb5378fSStefano Zampini ierr = PetscSFGatherEnd(mmdata->sf,MPIU_INT,coo_j,coo_j2);CHKERRQ(ierr); 6714eb5378fSStefano Zampini ierr = PetscFree2(coo_i,coo_j);CHKERRQ(ierr); 6724eb5378fSStefano Zampini ierr = PetscMalloc1(ncoo,&mmdata->coo_w);CHKERRQ(ierr); 6734eb5378fSStefano Zampini coo_i = coo_i2; 6744eb5378fSStefano Zampini coo_j = coo_j2; 6754eb5378fSStefano Zampini ncoo = ncoo2; 6764eb5378fSStefano Zampini } 6774eb5378fSStefano Zampini 6784eb5378fSStefano Zampini /* preallocate with COO data */ 6794eb5378fSStefano Zampini ierr = MatSetPreallocationCOO(C,ncoo,coo_i,coo_j);CHKERRQ(ierr); 6804eb5378fSStefano Zampini ierr = PetscMalloc1(ncoo,&mmdata->coo_v);CHKERRQ(ierr); 6814eb5378fSStefano Zampini ierr = PetscFree2(coo_i,coo_j);CHKERRQ(ierr); 6824eb5378fSStefano Zampini PetscFunctionReturn(0); 6834eb5378fSStefano Zampini } 6844eb5378fSStefano Zampini 6854eb5378fSStefano Zampini static PetscErrorCode MatProductSetFromOptions_MPIAIJCUSPARSE(Mat mat) 6864eb5378fSStefano Zampini { 6874eb5378fSStefano Zampini Mat_Product *product = mat->product; 6884eb5378fSStefano Zampini PetscErrorCode ierr; 6894eb5378fSStefano Zampini PetscBool Biscusp = PETSC_FALSE; 6904eb5378fSStefano Zampini 6914eb5378fSStefano Zampini PetscFunctionBegin; 6924eb5378fSStefano Zampini MatCheckProduct(mat,1); 6934eb5378fSStefano Zampini if (!product->B->boundtocpu) { 6944eb5378fSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->B,MATMPIAIJCUSPARSE,&Biscusp);CHKERRQ(ierr); 6954eb5378fSStefano Zampini } 6964eb5378fSStefano Zampini if (Biscusp) { 6974eb5378fSStefano Zampini switch (product->type) { 6984eb5378fSStefano Zampini case MATPRODUCT_AB: 6994eb5378fSStefano Zampini case MATPRODUCT_AtB: 7004eb5378fSStefano Zampini case MATPRODUCT_PtAP: 7014eb5378fSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_MPIAIJCUSPARSE_MPIAIJCUSPARSE; 7024eb5378fSStefano Zampini break; 7034eb5378fSStefano Zampini default: 7044eb5378fSStefano Zampini break; 7054eb5378fSStefano Zampini } 7064eb5378fSStefano Zampini } 7074eb5378fSStefano Zampini if (!mat->ops->productsymbolic) { 7084eb5378fSStefano Zampini ierr = MatProductSetFromOptions_MPIAIJ(mat);CHKERRQ(ierr); 7094eb5378fSStefano Zampini } 7104eb5378fSStefano Zampini PetscFunctionReturn(0); 7114eb5378fSStefano Zampini } 7124eb5378fSStefano Zampini 713*e589036eSStefano Zampini /*@ 714*e589036eSStefano Zampini MatAIJCUSPARSESetGenerateTranspose - Sets the flag to explicitly generate the transpose matrix before calling MatMultTranspose 715*e589036eSStefano Zampini 716*e589036eSStefano Zampini Not collective 717*e589036eSStefano Zampini 718*e589036eSStefano Zampini Input Parameters: 719*e589036eSStefano Zampini + A - Matrix of type SEQAIJCUSPARSE or MPIAIJCUSPARSE 720*e589036eSStefano Zampini - gen - the boolean flag 721*e589036eSStefano Zampini 722*e589036eSStefano Zampini Level: intermediate 723*e589036eSStefano Zampini 724*e589036eSStefano Zampini .seealso: MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE 725*e589036eSStefano Zampini @*/ 726*e589036eSStefano Zampini PetscErrorCode MatAIJCUSPARSESetGenerateTranspose(Mat A, PetscBool gen) 727*e589036eSStefano Zampini { 728*e589036eSStefano Zampini PetscErrorCode ierr; 729*e589036eSStefano Zampini PetscBool ismpiaij; 730*e589036eSStefano Zampini 731*e589036eSStefano Zampini PetscFunctionBegin; 732*e589036eSStefano Zampini PetscValidHeaderSpecific(A,MAT_CLASSID,1); 733*e589036eSStefano Zampini MatCheckPreallocated(A,1); 734*e589036eSStefano Zampini ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATMPIAIJ,&ismpiaij);CHKERRQ(ierr); 735*e589036eSStefano Zampini if (ismpiaij) { 736*e589036eSStefano Zampini Mat A_d,A_o; 737*e589036eSStefano Zampini 738*e589036eSStefano Zampini ierr = MatMPIAIJGetSeqAIJ(A,&A_d,&A_o,NULL);CHKERRQ(ierr); 739*e589036eSStefano Zampini ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A_d,gen);CHKERRQ(ierr); 740*e589036eSStefano Zampini ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A_o,gen);CHKERRQ(ierr); 741*e589036eSStefano Zampini } else { 742*e589036eSStefano Zampini ierr = MatSeqAIJCUSPARSESetGenerateTranspose(A,gen);CHKERRQ(ierr); 743*e589036eSStefano Zampini } 744*e589036eSStefano Zampini PetscFunctionReturn(0); 745*e589036eSStefano Zampini } 746*e589036eSStefano Zampini 7479ae82921SPaul Mullowney PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 7489ae82921SPaul Mullowney { 7499ae82921SPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 7509ae82921SPaul Mullowney PetscErrorCode ierr; 7519ae82921SPaul Mullowney PetscInt nt; 7529ae82921SPaul Mullowney 7539ae82921SPaul Mullowney PetscFunctionBegin; 7549ae82921SPaul Mullowney ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 7559ae82921SPaul Mullowney if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 7569ae82921SPaul Mullowney ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 7574d55d066SJunchao Zhang ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 7589ae82921SPaul Mullowney ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 7599ae82921SPaul Mullowney ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 7609ae82921SPaul Mullowney PetscFunctionReturn(0); 7619ae82921SPaul Mullowney } 762ca45077fSPaul Mullowney 7633fa6b06aSMark Adams PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A) 7643fa6b06aSMark Adams { 7653fa6b06aSMark Adams Mat_MPIAIJ *l = (Mat_MPIAIJ*)A->data; 7663fa6b06aSMark Adams PetscErrorCode ierr; 7673fa6b06aSMark Adams 7683fa6b06aSMark Adams PetscFunctionBegin; 7693fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 7703fa6b06aSMark Adams Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr; 7713fa6b06aSMark Adams PetscSplitCSRDataStructure *d_mat = spptr->deviceMat; 7723fa6b06aSMark Adams if (d_mat) { 7733fa6b06aSMark Adams Mat_SeqAIJ *a = (Mat_SeqAIJ*)l->A->data; 7743fa6b06aSMark Adams Mat_SeqAIJ *b = (Mat_SeqAIJ*)l->B->data; 7753fa6b06aSMark Adams PetscInt n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n]; 7763fa6b06aSMark Adams cudaError_t err; 7773fa6b06aSMark Adams PetscScalar *vals; 7783fa6b06aSMark Adams ierr = PetscInfo(A,"Zero device matrix diag and offfdiag\n");CHKERRQ(ierr); 7793fa6b06aSMark Adams err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 7803fa6b06aSMark Adams err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err); 7813fa6b06aSMark Adams err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 7823fa6b06aSMark Adams err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err); 7833fa6b06aSMark Adams } 7843fa6b06aSMark Adams } 7853fa6b06aSMark Adams ierr = MatZeroEntries(l->A);CHKERRQ(ierr); 7863fa6b06aSMark Adams ierr = MatZeroEntries(l->B);CHKERRQ(ierr); 7873fa6b06aSMark Adams 7883fa6b06aSMark Adams PetscFunctionReturn(0); 7893fa6b06aSMark Adams } 790fdc842d1SBarry Smith PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 791fdc842d1SBarry Smith { 792fdc842d1SBarry Smith Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 793fdc842d1SBarry Smith PetscErrorCode ierr; 794fdc842d1SBarry Smith PetscInt nt; 795fdc842d1SBarry Smith 796fdc842d1SBarry Smith PetscFunctionBegin; 797fdc842d1SBarry Smith ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 798fdc842d1SBarry Smith if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 799fdc842d1SBarry Smith ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 8004d55d066SJunchao Zhang ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr); 801fdc842d1SBarry Smith ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 802fdc842d1SBarry Smith ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr); 803fdc842d1SBarry Smith PetscFunctionReturn(0); 804fdc842d1SBarry Smith } 805fdc842d1SBarry Smith 806ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 807ca45077fSPaul Mullowney { 808ca45077fSPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 809ca45077fSPaul Mullowney PetscErrorCode ierr; 810ca45077fSPaul Mullowney PetscInt nt; 811ca45077fSPaul Mullowney 812ca45077fSPaul Mullowney PetscFunctionBegin; 813ca45077fSPaul Mullowney ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 814ccf5f80bSJunchao Zhang if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt); 8159b2db222SKarl Rupp ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr); 816ca45077fSPaul Mullowney ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 8179b2db222SKarl Rupp ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 8189b2db222SKarl Rupp ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 819ca45077fSPaul Mullowney PetscFunctionReturn(0); 820ca45077fSPaul Mullowney } 8219ae82921SPaul Mullowney 822e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 823ca45077fSPaul Mullowney { 824ca45077fSPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 825bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 826e057df02SPaul Mullowney 827ca45077fSPaul Mullowney PetscFunctionBegin; 828e057df02SPaul Mullowney switch (op) { 829e057df02SPaul Mullowney case MAT_CUSPARSE_MULT_DIAG: 830e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = format; 831045c96e1SPaul Mullowney break; 832e057df02SPaul Mullowney case MAT_CUSPARSE_MULT_OFFDIAG: 833e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = format; 834045c96e1SPaul Mullowney break; 835e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 836e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = format; 837e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = format; 838045c96e1SPaul Mullowney break; 839e057df02SPaul Mullowney default: 840e057df02SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 841045c96e1SPaul Mullowney } 842ca45077fSPaul Mullowney PetscFunctionReturn(0); 843ca45077fSPaul Mullowney } 844e057df02SPaul Mullowney 8454416b707SBarry Smith PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 846e057df02SPaul Mullowney { 847e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 848e057df02SPaul Mullowney PetscErrorCode ierr; 849e057df02SPaul Mullowney PetscBool flg; 850a183c035SDominic Meiser Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 851a183c035SDominic Meiser Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 8525fd66863SKarl Rupp 853e057df02SPaul Mullowney PetscFunctionBegin; 854e55864a3SBarry Smith ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 855e057df02SPaul Mullowney if (A->factortype==MAT_FACTOR_NONE) { 856e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 857a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 858e057df02SPaul Mullowney if (flg) { 859e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 860e057df02SPaul Mullowney } 861e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 862a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 863e057df02SPaul Mullowney if (flg) { 864e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 865e057df02SPaul Mullowney } 866e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 867a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 868e057df02SPaul Mullowney if (flg) { 869e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 870e057df02SPaul Mullowney } 871e057df02SPaul Mullowney } 8720af67c1bSStefano Zampini ierr = PetscOptionsTail();CHKERRQ(ierr); 873e057df02SPaul Mullowney PetscFunctionReturn(0); 874e057df02SPaul Mullowney } 875e057df02SPaul Mullowney 87634d6c7a5SJose E. Roman PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 87734d6c7a5SJose E. Roman { 87834d6c7a5SJose E. Roman PetscErrorCode ierr; 8793fa6b06aSMark Adams Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 8803fa6b06aSMark Adams Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr; 8813fa6b06aSMark Adams PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat; 88234d6c7a5SJose E. Roman PetscFunctionBegin; 88334d6c7a5SJose E. Roman ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 88434d6c7a5SJose E. Roman if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) { 88534d6c7a5SJose E. Roman ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); 88634d6c7a5SJose E. Roman } 887a587d139SMark if (d_mat) { 8883fa6b06aSMark Adams A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device 8893fa6b06aSMark Adams } 8903fa6b06aSMark Adams 89134d6c7a5SJose E. Roman PetscFunctionReturn(0); 89234d6c7a5SJose E. Roman } 89334d6c7a5SJose E. Roman 894bbf3fe20SPaul Mullowney PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 895bbf3fe20SPaul Mullowney { 896bbf3fe20SPaul Mullowney PetscErrorCode ierr; 8973fa6b06aSMark Adams Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 8983fa6b06aSMark Adams Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr; 899b06137fdSPaul Mullowney cudaError_t err; 900b06137fdSPaul Mullowney cusparseStatus_t stat; 901bbf3fe20SPaul Mullowney 902bbf3fe20SPaul Mullowney PetscFunctionBegin; 903d98d7c49SStefano Zampini if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 9043fa6b06aSMark Adams if (cusparseStruct->deviceMat) { 9053fa6b06aSMark Adams Mat_SeqAIJ *jaca = (Mat_SeqAIJ*)aij->A->data; 9063fa6b06aSMark Adams Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data; 9073fa6b06aSMark Adams PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat; 9083fa6b06aSMark Adams ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 9093fa6b06aSMark Adams err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 9103fa6b06aSMark Adams if (jaca->compressedrow.use) { 9113fa6b06aSMark Adams err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 9123fa6b06aSMark Adams } 9133fa6b06aSMark Adams if (jacb->compressedrow.use) { 9143fa6b06aSMark Adams err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err); 9153fa6b06aSMark Adams } 9163fa6b06aSMark Adams err = cudaFree(h_mat.colmap);CHKERRCUDA(err); 9173fa6b06aSMark Adams err = cudaFree(d_mat);CHKERRCUDA(err); 9183fa6b06aSMark Adams } 919bbf3fe20SPaul Mullowney try { 9203fa6b06aSMark Adams if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); } 9213fa6b06aSMark Adams if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); } 92257d48284SJunchao Zhang stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat); 92317403302SKarl Rupp if (cusparseStruct->stream) { 924c41cb2e2SAlejandro Lamas Daviña err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err); 92517403302SKarl Rupp } 9267e8381f9SStefano Zampini delete cusparseStruct->coo_p; 9277e8381f9SStefano Zampini delete cusparseStruct->coo_pw; 928bbf3fe20SPaul Mullowney delete cusparseStruct; 929bbf3fe20SPaul Mullowney } catch(char *ex) { 930bbf3fe20SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 931bbf3fe20SPaul Mullowney } 9323338378cSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr); 933ed502f03SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr); 9347e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 9357e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr); 9363338378cSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 937bbf3fe20SPaul Mullowney ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 938bbf3fe20SPaul Mullowney PetscFunctionReturn(0); 939bbf3fe20SPaul Mullowney } 940ca45077fSPaul Mullowney 9413338378cSStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat) 9429ae82921SPaul Mullowney { 9439ae82921SPaul Mullowney PetscErrorCode ierr; 944bbf3fe20SPaul Mullowney Mat_MPIAIJ *a; 945bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE *cusparseStruct; 946b06137fdSPaul Mullowney cusparseStatus_t stat; 9473338378cSStefano Zampini Mat A; 9489ae82921SPaul Mullowney 9499ae82921SPaul Mullowney PetscFunctionBegin; 9503338378cSStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 9513338378cSStefano Zampini ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 9523338378cSStefano Zampini } else if (reuse == MAT_REUSE_MATRIX) { 9533338378cSStefano Zampini ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 9543338378cSStefano Zampini } 9553338378cSStefano Zampini A = *newmat; 9563338378cSStefano Zampini 95734136279SStefano Zampini ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 95834136279SStefano Zampini ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr); 95934136279SStefano Zampini 960bbf3fe20SPaul Mullowney a = (Mat_MPIAIJ*)A->data; 961d98d7c49SStefano Zampini if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); } 962d98d7c49SStefano Zampini if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); } 963d98d7c49SStefano Zampini if (a->lvec) { 964d98d7c49SStefano Zampini ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr); 965d98d7c49SStefano Zampini } 966d98d7c49SStefano Zampini 9673338378cSStefano Zampini if (reuse != MAT_REUSE_MATRIX && !a->spptr) { 968bbf3fe20SPaul Mullowney a->spptr = new Mat_MPIAIJCUSPARSE; 9692205254eSKarl Rupp 970bbf3fe20SPaul Mullowney cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 971e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 972e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 9737e8381f9SStefano Zampini cusparseStruct->coo_p = NULL; 9747e8381f9SStefano Zampini cusparseStruct->coo_pw = NULL; 97517403302SKarl Rupp cusparseStruct->stream = 0; 97657d48284SJunchao Zhang stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat); 9773fa6b06aSMark Adams cusparseStruct->deviceMat = NULL; 9783338378cSStefano Zampini } 9792205254eSKarl Rupp 98034d6c7a5SJose E. Roman A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 981bbf3fe20SPaul Mullowney A->ops->mult = MatMult_MPIAIJCUSPARSE; 982fdc842d1SBarry Smith A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 983bbf3fe20SPaul Mullowney A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 984bbf3fe20SPaul Mullowney A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 985bbf3fe20SPaul Mullowney A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 9863fa6b06aSMark Adams A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE; 9874eb5378fSStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJCUSPARSE; 9882205254eSKarl Rupp 989bbf3fe20SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 990ed502f03SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr); 9913338378cSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 992bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 9937e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr); 9947e8381f9SStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr); 9959ae82921SPaul Mullowney PetscFunctionReturn(0); 9969ae82921SPaul Mullowney } 9979ae82921SPaul Mullowney 9983338378cSStefano Zampini PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 9993338378cSStefano Zampini { 10003338378cSStefano Zampini PetscErrorCode ierr; 10013338378cSStefano Zampini 10023338378cSStefano Zampini PetscFunctionBegin; 100305035670SJunchao Zhang ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); 10043338378cSStefano Zampini ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 10053338378cSStefano Zampini ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 10063338378cSStefano Zampini PetscFunctionReturn(0); 10073338378cSStefano Zampini } 10083338378cSStefano Zampini 10093f9c0db1SPaul Mullowney /*@ 10103f9c0db1SPaul Mullowney MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 1011e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 10123f9c0db1SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 1013e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 1014e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 1015e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 10169ae82921SPaul Mullowney 1017d083f849SBarry Smith Collective 1018e057df02SPaul Mullowney 1019e057df02SPaul Mullowney Input Parameters: 1020e057df02SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 1021e057df02SPaul Mullowney . m - number of rows 1022e057df02SPaul Mullowney . n - number of columns 1023e057df02SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 1024e057df02SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 10250298fd71SBarry Smith (possibly different for each row) or NULL 1026e057df02SPaul Mullowney 1027e057df02SPaul Mullowney Output Parameter: 1028e057df02SPaul Mullowney . A - the matrix 1029e057df02SPaul Mullowney 1030e057df02SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 1031e057df02SPaul Mullowney MatXXXXSetPreallocation() paradigm instead of this routine directly. 1032e057df02SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 1033e057df02SPaul Mullowney 1034e057df02SPaul Mullowney Notes: 1035e057df02SPaul Mullowney If nnz is given then nz is ignored 1036e057df02SPaul Mullowney 1037e057df02SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 1038e057df02SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 1039e057df02SPaul Mullowney storage. That is, the stored row and column indices can begin at 1040e057df02SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 1041e057df02SPaul Mullowney 1042e057df02SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 10430298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 1044e057df02SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 1045e057df02SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 1046e057df02SPaul Mullowney 1047e057df02SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 1048e057df02SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 1049e057df02SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 1050e057df02SPaul Mullowney reusing matrix information to achieve increased efficiency. 1051e057df02SPaul Mullowney 1052e057df02SPaul Mullowney Level: intermediate 1053e057df02SPaul Mullowney 1054e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 1055e057df02SPaul Mullowney @*/ 10569ae82921SPaul Mullowney PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 10579ae82921SPaul Mullowney { 10589ae82921SPaul Mullowney PetscErrorCode ierr; 10599ae82921SPaul Mullowney PetscMPIInt size; 10609ae82921SPaul Mullowney 10619ae82921SPaul Mullowney PetscFunctionBegin; 10629ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 10639ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 1064ffc4695bSBarry Smith ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 10659ae82921SPaul Mullowney if (size > 1) { 10669ae82921SPaul Mullowney ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 10679ae82921SPaul Mullowney ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 10689ae82921SPaul Mullowney } else { 10699ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 10709ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 10719ae82921SPaul Mullowney } 10729ae82921SPaul Mullowney PetscFunctionReturn(0); 10739ae82921SPaul Mullowney } 10749ae82921SPaul Mullowney 10753ca39a21SBarry Smith /*MC 1076e057df02SPaul Mullowney MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices. 1077e057df02SPaul Mullowney 10782692e278SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 10792692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 10802692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 10819ae82921SPaul Mullowney 10829ae82921SPaul Mullowney This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 10839ae82921SPaul Mullowney and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 10849ae82921SPaul Mullowney MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 10859ae82921SPaul Mullowney for communicators controlling multiple processes. It is recommended that you call both of 10869ae82921SPaul Mullowney the above preallocation routines for simplicity. 10879ae82921SPaul Mullowney 10889ae82921SPaul Mullowney Options Database Keys: 1089e057df02SPaul Mullowney + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 10908468deeeSKarl Rupp . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 10918468deeeSKarl Rupp . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 10928468deeeSKarl Rupp - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 10939ae82921SPaul Mullowney 10949ae82921SPaul Mullowney Level: beginner 10959ae82921SPaul Mullowney 10968468deeeSKarl Rupp .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 10978468deeeSKarl Rupp M 10989ae82921SPaul Mullowney M*/ 10993fa6b06aSMark Adams 11003fa6b06aSMark Adams // get GPU pointer to stripped down Mat. For both Seq and MPI Mat. 11013fa6b06aSMark Adams PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B) 11023fa6b06aSMark Adams { 11039db3cbf9SStefano Zampini #if defined(PETSC_USE_CTABLE) 11049db3cbf9SStefano Zampini SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)"); 11059db3cbf9SStefano Zampini #else 11063fa6b06aSMark Adams PetscSplitCSRDataStructure **p_d_mat; 11073fa6b06aSMark Adams PetscMPIInt size,rank; 11083fa6b06aSMark Adams MPI_Comm comm; 11093fa6b06aSMark Adams PetscErrorCode ierr; 11103fa6b06aSMark Adams int *ai,*bi,*aj,*bj; 11113fa6b06aSMark Adams PetscScalar *aa,*ba; 11123fa6b06aSMark Adams 11133fa6b06aSMark Adams PetscFunctionBegin; 11143fa6b06aSMark Adams ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 1115ffc4695bSBarry Smith ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr); 1116ffc4695bSBarry Smith ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr); 11173fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 11183fa6b06aSMark Adams CsrMatrix *matrixA,*matrixB=NULL; 11193fa6b06aSMark Adams if (size == 1) { 11203fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 11213fa6b06aSMark Adams p_d_mat = &cusparsestruct->deviceMat; 11223fa6b06aSMark Adams Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 11233fa6b06aSMark Adams if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 11243fa6b06aSMark Adams matrixA = (CsrMatrix*)matstruct->mat; 11253fa6b06aSMark Adams bi = bj = NULL; ba = NULL; 11266b78a27eSPierre Jolivet } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR"); 11273fa6b06aSMark Adams } else { 11283fa6b06aSMark Adams Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 11293fa6b06aSMark Adams Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 11303fa6b06aSMark Adams p_d_mat = &spptr->deviceMat; 11313fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr; 11323fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr; 11333fa6b06aSMark Adams Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 11343fa6b06aSMark Adams Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat; 11353fa6b06aSMark Adams if (cusparsestructA->format==MAT_CUSPARSE_CSR) { 11363fa6b06aSMark Adams if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR"); 11373fa6b06aSMark Adams matrixA = (CsrMatrix*)matstructA->mat; 11383fa6b06aSMark Adams matrixB = (CsrMatrix*)matstructB->mat; 11393fa6b06aSMark Adams bi = thrust::raw_pointer_cast(matrixB->row_offsets->data()); 11403fa6b06aSMark Adams bj = thrust::raw_pointer_cast(matrixB->column_indices->data()); 11413fa6b06aSMark Adams ba = thrust::raw_pointer_cast(matrixB->values->data()); 11426b78a27eSPierre Jolivet } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR"); 11433fa6b06aSMark Adams } 11443fa6b06aSMark Adams ai = thrust::raw_pointer_cast(matrixA->row_offsets->data()); 11453fa6b06aSMark Adams aj = thrust::raw_pointer_cast(matrixA->column_indices->data()); 11463fa6b06aSMark Adams aa = thrust::raw_pointer_cast(matrixA->values->data()); 11473fa6b06aSMark Adams } else { 11483fa6b06aSMark Adams *B = NULL; 11493fa6b06aSMark Adams PetscFunctionReturn(0); 11503fa6b06aSMark Adams } 11513fa6b06aSMark Adams // act like MatSetValues because not called on host 11523fa6b06aSMark Adams if (A->assembled) { 11533fa6b06aSMark Adams if (A->was_assembled) { 11543fa6b06aSMark Adams ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr); 11553fa6b06aSMark Adams } 11563fa6b06aSMark Adams A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here? 11573fa6b06aSMark Adams } else { 11583fa6b06aSMark Adams SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix"); 11593fa6b06aSMark Adams } 11603fa6b06aSMark Adams if (!*p_d_mat) { 11613fa6b06aSMark Adams cudaError_t err; 11623fa6b06aSMark Adams PetscSplitCSRDataStructure *d_mat, h_mat; 11633fa6b06aSMark Adams Mat_SeqAIJ *jaca; 11643fa6b06aSMark Adams PetscInt n = A->rmap->n, nnz; 11653fa6b06aSMark Adams // create and copy 11663fa6b06aSMark Adams ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr); 11673fa6b06aSMark Adams err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err); 11683fa6b06aSMark Adams err = cudaMemset( d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err); 11693fa6b06aSMark Adams *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up 11703fa6b06aSMark Adams if (size == 1) { 11713fa6b06aSMark Adams jaca = (Mat_SeqAIJ*)A->data; 11723fa6b06aSMark Adams h_mat.rstart = 0; h_mat.rend = A->rmap->n; 11733fa6b06aSMark Adams h_mat.cstart = 0; h_mat.cend = A->cmap->n; 11743fa6b06aSMark Adams h_mat.offdiag.i = h_mat.offdiag.j = NULL; 11753fa6b06aSMark Adams h_mat.offdiag.a = NULL; 11763fa6b06aSMark Adams } else { 11773fa6b06aSMark Adams Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 11783fa6b06aSMark Adams Mat_SeqAIJ *jacb; 11793fa6b06aSMark Adams jaca = (Mat_SeqAIJ*)aij->A->data; 11803fa6b06aSMark Adams jacb = (Mat_SeqAIJ*)aij->B->data; 11813fa6b06aSMark Adams if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 11823fa6b06aSMark Adams if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n"); 11833fa6b06aSMark Adams // create colmap - this is ussually done (lazy) in MatSetValues 11843fa6b06aSMark Adams aij->donotstash = PETSC_TRUE; 11853fa6b06aSMark Adams aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE; 11863fa6b06aSMark Adams jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly 11873fa6b06aSMark Adams ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr); 11883fa6b06aSMark Adams aij->colmap[A->cmap->N] = -9; 11893fa6b06aSMark Adams ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr); 11903fa6b06aSMark Adams { 11913fa6b06aSMark Adams PetscInt ii; 11923fa6b06aSMark Adams for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1; 11933fa6b06aSMark Adams } 11943fa6b06aSMark Adams if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9"); 11953fa6b06aSMark Adams // allocate B copy data 11963fa6b06aSMark Adams h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend; 11973fa6b06aSMark Adams h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend; 11983fa6b06aSMark Adams nnz = jacb->i[n]; 11993fa6b06aSMark Adams 12003fa6b06aSMark Adams if (jacb->compressedrow.use) { 12013fa6b06aSMark Adams err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input 12023fa6b06aSMark Adams err = cudaMemcpy( h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err); 12036b78a27eSPierre Jolivet } else h_mat.offdiag.i = bi; 12043fa6b06aSMark Adams h_mat.offdiag.j = bj; 12053fa6b06aSMark Adams h_mat.offdiag.a = ba; 12063fa6b06aSMark Adams 12073fa6b06aSMark Adams err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output 12083fa6b06aSMark Adams err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err); 12093fa6b06aSMark Adams h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries; 12103fa6b06aSMark Adams h_mat.offdiag.n = n; 12113fa6b06aSMark Adams } 12123fa6b06aSMark Adams // allocate A copy data 12133fa6b06aSMark Adams nnz = jaca->i[n]; 12143fa6b06aSMark Adams h_mat.diag.n = n; 12153fa6b06aSMark Adams h_mat.diag.ignorezeroentries = jaca->ignorezeroentries; 1216ffc4695bSBarry Smith ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr); 12173fa6b06aSMark Adams if (jaca->compressedrow.use) { 12183fa6b06aSMark Adams err = cudaMalloc((void **)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input 12193fa6b06aSMark Adams err = cudaMemcpy( h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err); 12203fa6b06aSMark Adams } else { 12213fa6b06aSMark Adams h_mat.diag.i = ai; 12223fa6b06aSMark Adams } 12233fa6b06aSMark Adams h_mat.diag.j = aj; 12243fa6b06aSMark Adams h_mat.diag.a = aa; 12253fa6b06aSMark Adams // copy pointers and metdata to device 12263fa6b06aSMark Adams err = cudaMemcpy( d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err); 12273fa6b06aSMark Adams ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr); 12283fa6b06aSMark Adams } else { 12293fa6b06aSMark Adams *B = *p_d_mat; 12303fa6b06aSMark Adams } 12313fa6b06aSMark Adams A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues 12323fa6b06aSMark Adams PetscFunctionReturn(0); 12339db3cbf9SStefano Zampini #endif 12343fa6b06aSMark Adams } 1235