#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatPinToCPU(b->A,B->pinnedtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatPinToCPU(b->B,B->pinnedtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /*
     This multiplication sequence is different from the CPU version. In particular,
     the diagonal block multiplication kernel is launched in one stream. Then, in a
     separate stream, the data transfers from DeviceToHost (with MPI messaging in
     between), then HostToDevice are launched. Once the data transfer stream is
     synchronized, to ensure messaging is complete, the MatMultAdd kernel is launched
     in the original (MatMult) stream to protect against race conditions.
  */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  /*
     This multiplication sequence is different from the CPU version. In particular,
     the diagonal block multiplication kernel is launched in one stream. Then, in a
     separate stream, the data transfers from DeviceToHost (with MPI messaging in
     between), then HostToDevice are launched. Once the data transfer stream is
     synchronized, to ensure messaging is complete, the MatMultAdd kernel is launched
     in the original (MatMult) stream to protect against race conditions.
  */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  /* This multiplication sequence is different from the CPU version. In particular,
     the diagonal block multiplication kernel is launched in one stream.
     Then, in a separate stream, the data transfers from DeviceToHost (with MPI
     messaging in between), then HostToDevice are launched. Once the data transfer
     stream is synchronized, to ensure messaging is complete, the MatMultAdd kernel
     is launched in the original (MatMult) stream to protect against race conditions.

     This sequence should only be called for GPU computation. */
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype == MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode ierr;
  Mat_MPIAIJ     *mpiaij;

  PetscFunctionBegin;
  mpiaij = (Mat_MPIAIJ*)A->data;
  ierr   = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
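/*
   A minimal usage sketch (illustrative only, not part of the library): the storage formats parsed
   by MatSetFromOptions_MPIAIJCUSPARSE() above can also be selected programmatically through the
   public MatCUSPARSESetFormat() interface. The matrix sizes and preallocation counts below are
   placeholders.

     Mat A;
     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,100,100);CHKERRQ(ierr);
     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);CHKERRQ(ierr);
     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,MAT_CUSPARSE_ELL);CHKERRQ(ierr);      <-- diagonal block in ELL
     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,MAT_CUSPARSE_CSR);CHKERRQ(ierr);   <-- off-diagonal block in CSR

   The same choice is available from the command line, e.g. -mat_cusparse_mult_diag_storage_format ell,
   which is handled by the routine above during MatSetFromOptions().
*/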
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  try {
    ierr = MatCUSPARSEClearHandle(a->A);CHKERRQ(ierr);
    ierr = MatCUSPARSEClearHandle(a->B);CHKERRQ(ierr);
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUDA(stat);
    err  = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a        = (Mat_MPIAIJ*)A->data;
  a->spptr = new Mat_MPIAIJCUSPARSE;

  cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
  cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
  cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
  stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUDA(stat);
  err  = cudaStreamCreate(&(cusparseStruct->stream));CHKERRCUDA(err);
  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either the nz parameters or the nnz arrays (not both).
   Set d_nz=PETSC_DEFAULT, o_nz=PETSC_DEFAULT and d_nnz=NULL, o_nnz=NULL for PETSc to control
   dynamic memory allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.
   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation() is supported, and similarly MatMPIAIJSetPreallocation() is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
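/*
   A minimal usage sketch (illustrative only, not part of the library): the MATAIJCUSPARSE man page
   above recommends the MatCreate()/MatSetType()/preallocation paradigm and calling both
   preallocation routines so the same code works for a communicator of any size. The global sizes
   M, N and the preallocation counts below are placeholders.

     Mat A;
     ierr = MatCreate(PETSC_COMM_WORLD,&A);CHKERRQ(ierr);
     ierr = MatSetSizes(A,PETSC_DECIDE,PETSC_DECIDE,M,N);CHKERRQ(ierr);
     ierr = MatSetType(A,MATAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatSetFromOptions(A);CHKERRQ(ierr);
     ierr = MatSeqAIJSetPreallocation(A,5,NULL);CHKERRQ(ierr);          <-- used on a one-process communicator
     ierr = MatMPIAIJSetPreallocation(A,5,NULL,2,NULL);CHKERRQ(ierr);   <-- used on a multi-process communicator
*/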