1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 253800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX 399acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 40fd2b57fSKarl Rupp 53d13b8fdSMatthew G. Knepley #include <petscconf.h> 69ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 757d48284SJunchao Zhang #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 83d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 99ae82921SPaul Mullowney 109ae82921SPaul Mullowney PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 119ae82921SPaul Mullowney { 12bbf3fe20SPaul Mullowney Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 13bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 149ae82921SPaul Mullowney PetscErrorCode ierr; 159ae82921SPaul Mullowney PetscInt i; 169ae82921SPaul Mullowney 179ae82921SPaul Mullowney PetscFunctionBegin; 189ae82921SPaul Mullowney ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr); 199ae82921SPaul Mullowney ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr); 209ae82921SPaul Mullowney if (d_nnz) { 219ae82921SPaul Mullowney for (i=0; i<B->rmap->n; i++) { 229ae82921SPaul Mullowney if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]); 239ae82921SPaul Mullowney } 249ae82921SPaul Mullowney } 259ae82921SPaul Mullowney if (o_nnz) { 269ae82921SPaul Mullowney for (i=0; i<B->rmap->n; i++) { 279ae82921SPaul Mullowney if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]); 289ae82921SPaul Mullowney } 299ae82921SPaul Mullowney } 309ae82921SPaul Mullowney if (!B->preallocated) { 31bbf3fe20SPaul Mullowney /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */ 329ae82921SPaul Mullowney ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr); 33b470e4b4SRichard Tran Mills ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr); 349ae82921SPaul Mullowney ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr); 353bb1ff40SBarry Smith ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr); 369ae82921SPaul Mullowney ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr); 37b470e4b4SRichard Tran Mills ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr); 389ae82921SPaul Mullowney ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr); 393bb1ff40SBarry Smith ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr); 409ae82921SPaul Mullowney } 41*d98d7c49SStefano Zampini ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 42*d98d7c49SStefano Zampini ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 43*d98d7c49SStefano Zampini if (b->lvec) { 44*d98d7c49SStefano Zampini ierr = VecSetType(b->lvec,VECSEQCUDA);CHKERRQ(ierr); 45*d98d7c49SStefano Zampini } 469ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr); 479ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr); 48e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr); 49e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr); 50b06137fdSPaul Mullowney ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr); 51b06137fdSPaul Mullowney ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr); 52b06137fdSPaul Mullowney ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr); 53b06137fdSPaul Mullowney ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr); 542205254eSKarl Rupp 559ae82921SPaul Mullowney B->preallocated = PETSC_TRUE; 569ae82921SPaul Mullowney PetscFunctionReturn(0); 579ae82921SPaul Mullowney } 58e057df02SPaul Mullowney 599ae82921SPaul Mullowney PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 609ae82921SPaul Mullowney { 619ae82921SPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 629ae82921SPaul Mullowney PetscErrorCode ierr; 639ae82921SPaul Mullowney PetscInt nt; 649ae82921SPaul Mullowney 659ae82921SPaul Mullowney PetscFunctionBegin; 669ae82921SPaul Mullowney ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 679ae82921SPaul Mullowney if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 68959dcdf5SJunchao Zhang ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr); 699ae82921SPaul Mullowney ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 704d55d066SJunchao Zhang ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr); 719ae82921SPaul Mullowney ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 729ae82921SPaul Mullowney ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr); 739ae82921SPaul Mullowney ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 749ae82921SPaul Mullowney PetscFunctionReturn(0); 759ae82921SPaul Mullowney } 76ca45077fSPaul Mullowney 773fa6b06aSMark Adams PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A) 783fa6b06aSMark Adams { 793fa6b06aSMark Adams Mat_MPIAIJ *l = (Mat_MPIAIJ*)A->data; 803fa6b06aSMark Adams PetscErrorCode ierr; 813fa6b06aSMark Adams 823fa6b06aSMark Adams PetscFunctionBegin; 833fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 843fa6b06aSMark Adams Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr; 853fa6b06aSMark Adams PetscSplitCSRDataStructure *d_mat = spptr->deviceMat; 863fa6b06aSMark Adams if (d_mat) { 873fa6b06aSMark Adams Mat_SeqAIJ *a = (Mat_SeqAIJ*)l->A->data; 883fa6b06aSMark Adams Mat_SeqAIJ *b = (Mat_SeqAIJ*)l->B->data; 893fa6b06aSMark Adams PetscInt n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n]; 903fa6b06aSMark Adams cudaError_t err; 913fa6b06aSMark Adams PetscScalar *vals; 923fa6b06aSMark Adams ierr = PetscInfo(A,"Zero device matrix diag and offfdiag\n");CHKERRQ(ierr); 933fa6b06aSMark Adams err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 943fa6b06aSMark Adams err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err); 953fa6b06aSMark Adams err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 963fa6b06aSMark Adams err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err); 973fa6b06aSMark Adams } 983fa6b06aSMark Adams } 993fa6b06aSMark Adams ierr = MatZeroEntries(l->A);CHKERRQ(ierr); 1003fa6b06aSMark Adams ierr = MatZeroEntries(l->B);CHKERRQ(ierr); 1013fa6b06aSMark Adams 1023fa6b06aSMark Adams PetscFunctionReturn(0); 1033fa6b06aSMark Adams } 104fdc842d1SBarry Smith PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 105fdc842d1SBarry Smith { 106fdc842d1SBarry Smith Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 107fdc842d1SBarry Smith PetscErrorCode ierr; 108fdc842d1SBarry Smith PetscInt nt; 109fdc842d1SBarry Smith 110fdc842d1SBarry Smith PetscFunctionBegin; 111fdc842d1SBarry Smith ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 112fdc842d1SBarry Smith if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt); 113fdc842d1SBarry Smith ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr); 114fdc842d1SBarry Smith ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 1154d55d066SJunchao Zhang ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr); 116fdc842d1SBarry Smith ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr); 117fdc842d1SBarry Smith ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr); 118fdc842d1SBarry Smith ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 119fdc842d1SBarry Smith PetscFunctionReturn(0); 120fdc842d1SBarry Smith } 121fdc842d1SBarry Smith 122ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 123ca45077fSPaul Mullowney { 124ca45077fSPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 125ca45077fSPaul Mullowney PetscErrorCode ierr; 126ca45077fSPaul Mullowney PetscInt nt; 127ca45077fSPaul Mullowney 128ca45077fSPaul Mullowney PetscFunctionBegin; 129ca45077fSPaul Mullowney ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr); 130ccf5f80bSJunchao Zhang if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt); 131a3fdcf43SKarl Rupp ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec);CHKERRQ(ierr); 1329b2db222SKarl Rupp ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr); 133ca45077fSPaul Mullowney ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr); 1349b2db222SKarl Rupp ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 1359b2db222SKarl Rupp ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr); 136ca45077fSPaul Mullowney ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr); 137ca45077fSPaul Mullowney PetscFunctionReturn(0); 138ca45077fSPaul Mullowney } 1399ae82921SPaul Mullowney 140e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 141ca45077fSPaul Mullowney { 142ca45077fSPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 143bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 144e057df02SPaul Mullowney 145ca45077fSPaul Mullowney PetscFunctionBegin; 146e057df02SPaul Mullowney switch (op) { 147e057df02SPaul Mullowney case MAT_CUSPARSE_MULT_DIAG: 148e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = format; 149045c96e1SPaul Mullowney break; 150e057df02SPaul Mullowney case MAT_CUSPARSE_MULT_OFFDIAG: 151e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = format; 152045c96e1SPaul Mullowney break; 153e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 154e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = format; 155e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = format; 156045c96e1SPaul Mullowney break; 157e057df02SPaul Mullowney default: 158e057df02SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 159045c96e1SPaul Mullowney } 160ca45077fSPaul Mullowney PetscFunctionReturn(0); 161ca45077fSPaul Mullowney } 162e057df02SPaul Mullowney 1634416b707SBarry Smith PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 164e057df02SPaul Mullowney { 165e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 166e057df02SPaul Mullowney PetscErrorCode ierr; 167e057df02SPaul Mullowney PetscBool flg; 168a183c035SDominic Meiser Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 169a183c035SDominic Meiser Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 1705fd66863SKarl Rupp 171e057df02SPaul Mullowney PetscFunctionBegin; 172e55864a3SBarry Smith ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr); 173e057df02SPaul Mullowney if (A->factortype==MAT_FACTOR_NONE) { 174e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 175a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 176e057df02SPaul Mullowney if (flg) { 177e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr); 178e057df02SPaul Mullowney } 179e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 180a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 181e057df02SPaul Mullowney if (flg) { 182e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr); 183e057df02SPaul Mullowney } 184e057df02SPaul Mullowney ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 185a183c035SDominic Meiser "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr); 186e057df02SPaul Mullowney if (flg) { 187e057df02SPaul Mullowney ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr); 188e057df02SPaul Mullowney } 189e057df02SPaul Mullowney } 1900af67c1bSStefano Zampini ierr = PetscOptionsTail();CHKERRQ(ierr); 191e057df02SPaul Mullowney PetscFunctionReturn(0); 192e057df02SPaul Mullowney } 193e057df02SPaul Mullowney 19434d6c7a5SJose E. Roman PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 19534d6c7a5SJose E. Roman { 19634d6c7a5SJose E. Roman PetscErrorCode ierr; 1973fa6b06aSMark Adams Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 1983fa6b06aSMark Adams Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr; 1993fa6b06aSMark Adams PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat; 2003fa6b06aSMark Adams PetscInt nnz_state = A->nonzerostate; 20134d6c7a5SJose E. Roman PetscFunctionBegin; 2023fa6b06aSMark Adams if (d_mat) { 2033fa6b06aSMark Adams cudaError_t err; 2043fa6b06aSMark Adams err = cudaMemcpy( &nnz_state, &d_mat->nonzerostate, sizeof(PetscInt), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 2053fa6b06aSMark Adams } 20634d6c7a5SJose E. Roman ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr); 20734d6c7a5SJose E. Roman if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) { 20834d6c7a5SJose E. Roman ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); 20934d6c7a5SJose E. Roman } 2103fa6b06aSMark Adams if (nnz_state > A->nonzerostate) { 2113fa6b06aSMark Adams A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device 2123fa6b06aSMark Adams } 2133fa6b06aSMark Adams 21434d6c7a5SJose E. Roman PetscFunctionReturn(0); 21534d6c7a5SJose E. Roman } 21634d6c7a5SJose E. Roman 217bbf3fe20SPaul Mullowney PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 218bbf3fe20SPaul Mullowney { 219bbf3fe20SPaul Mullowney PetscErrorCode ierr; 2203fa6b06aSMark Adams Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 2213fa6b06aSMark Adams Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr; 222b06137fdSPaul Mullowney cudaError_t err; 223b06137fdSPaul Mullowney cusparseStatus_t stat; 224bbf3fe20SPaul Mullowney 225bbf3fe20SPaul Mullowney PetscFunctionBegin; 226*d98d7c49SStefano Zampini if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 2273fa6b06aSMark Adams if (cusparseStruct->deviceMat) { 2283fa6b06aSMark Adams Mat_SeqAIJ *jaca = (Mat_SeqAIJ*)aij->A->data; 2293fa6b06aSMark Adams Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data; 2303fa6b06aSMark Adams PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat; 2313fa6b06aSMark Adams ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr); 2323fa6b06aSMark Adams err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err); 2333fa6b06aSMark Adams if (jaca->compressedrow.use) { 2343fa6b06aSMark Adams err = cudaFree(h_mat.diag.i);CHKERRCUDA(err); 2353fa6b06aSMark Adams } 2363fa6b06aSMark Adams if (jacb->compressedrow.use) { 2373fa6b06aSMark Adams err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err); 2383fa6b06aSMark Adams } 2393fa6b06aSMark Adams err = cudaFree(h_mat.colmap);CHKERRCUDA(err); 2403fa6b06aSMark Adams err = cudaFree(d_mat);CHKERRCUDA(err); 2413fa6b06aSMark Adams } 242bbf3fe20SPaul Mullowney try { 2433fa6b06aSMark Adams if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); } 2443fa6b06aSMark Adams if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); } 24557d48284SJunchao Zhang stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat); 24617403302SKarl Rupp if (cusparseStruct->stream) { 247c41cb2e2SAlejandro Lamas Daviña err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err); 24817403302SKarl Rupp } 249bbf3fe20SPaul Mullowney delete cusparseStruct; 250bbf3fe20SPaul Mullowney } catch(char *ex) { 251bbf3fe20SPaul Mullowney SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 252bbf3fe20SPaul Mullowney } 2533338378cSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr); 2543338378cSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr); 255bbf3fe20SPaul Mullowney ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr); 256bbf3fe20SPaul Mullowney PetscFunctionReturn(0); 257bbf3fe20SPaul Mullowney } 258ca45077fSPaul Mullowney 2593338378cSStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat) 2609ae82921SPaul Mullowney { 2619ae82921SPaul Mullowney PetscErrorCode ierr; 262bbf3fe20SPaul Mullowney Mat_MPIAIJ *a; 263bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE *cusparseStruct; 264b06137fdSPaul Mullowney cusparseStatus_t stat; 2653338378cSStefano Zampini Mat A; 2669ae82921SPaul Mullowney 2679ae82921SPaul Mullowney PetscFunctionBegin; 2683338378cSStefano Zampini if (reuse == MAT_INITIAL_MATRIX) { 2693338378cSStefano Zampini ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); 2703338378cSStefano Zampini } else if (reuse == MAT_REUSE_MATRIX) { 2713338378cSStefano Zampini ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); 2723338378cSStefano Zampini } 2733338378cSStefano Zampini A = *newmat; 2743338378cSStefano Zampini 27534136279SStefano Zampini ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 27634136279SStefano Zampini ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr); 27734136279SStefano Zampini 278bbf3fe20SPaul Mullowney a = (Mat_MPIAIJ*)A->data; 279*d98d7c49SStefano Zampini if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); } 280*d98d7c49SStefano Zampini if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); } 281*d98d7c49SStefano Zampini if (a->lvec) { 282*d98d7c49SStefano Zampini ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr); 283*d98d7c49SStefano Zampini } 284*d98d7c49SStefano Zampini 2853338378cSStefano Zampini if (reuse != MAT_REUSE_MATRIX && !a->spptr) { 286bbf3fe20SPaul Mullowney a->spptr = new Mat_MPIAIJCUSPARSE; 2872205254eSKarl Rupp 288bbf3fe20SPaul Mullowney cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 289e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = MAT_CUSPARSE_CSR; 290e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR; 29117403302SKarl Rupp cusparseStruct->stream = 0; 29257d48284SJunchao Zhang stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat); 2933fa6b06aSMark Adams cusparseStruct->deviceMat = NULL; 2943338378cSStefano Zampini } 2952205254eSKarl Rupp 29634d6c7a5SJose E. Roman A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 297bbf3fe20SPaul Mullowney A->ops->mult = MatMult_MPIAIJCUSPARSE; 298fdc842d1SBarry Smith A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 299bbf3fe20SPaul Mullowney A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 300bbf3fe20SPaul Mullowney A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 301bbf3fe20SPaul Mullowney A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 3023fa6b06aSMark Adams A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE; 3032205254eSKarl Rupp 304bbf3fe20SPaul Mullowney ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 3053338378cSStefano Zampini ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr); 306bdf89e91SBarry Smith ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr); 3079ae82921SPaul Mullowney PetscFunctionReturn(0); 3089ae82921SPaul Mullowney } 3099ae82921SPaul Mullowney 3103338378cSStefano Zampini PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 3113338378cSStefano Zampini { 3123338378cSStefano Zampini PetscErrorCode ierr; 3133338378cSStefano Zampini 3143338378cSStefano Zampini PetscFunctionBegin; 31505035670SJunchao Zhang ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr); 3163338378cSStefano Zampini ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 3173338378cSStefano Zampini ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 3183338378cSStefano Zampini PetscFunctionReturn(0); 3193338378cSStefano Zampini } 3203338378cSStefano Zampini 3213f9c0db1SPaul Mullowney /*@ 3223f9c0db1SPaul Mullowney MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 323e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 3243f9c0db1SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 325e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 326e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 327e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 3289ae82921SPaul Mullowney 329d083f849SBarry Smith Collective 330e057df02SPaul Mullowney 331e057df02SPaul Mullowney Input Parameters: 332e057df02SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 333e057df02SPaul Mullowney . m - number of rows 334e057df02SPaul Mullowney . n - number of columns 335e057df02SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 336e057df02SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 3370298fd71SBarry Smith (possibly different for each row) or NULL 338e057df02SPaul Mullowney 339e057df02SPaul Mullowney Output Parameter: 340e057df02SPaul Mullowney . A - the matrix 341e057df02SPaul Mullowney 342e057df02SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 343e057df02SPaul Mullowney MatXXXXSetPreallocation() paradigm instead of this routine directly. 344e057df02SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 345e057df02SPaul Mullowney 346e057df02SPaul Mullowney Notes: 347e057df02SPaul Mullowney If nnz is given then nz is ignored 348e057df02SPaul Mullowney 349e057df02SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 350e057df02SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 351e057df02SPaul Mullowney storage. That is, the stored row and column indices can begin at 352e057df02SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 353e057df02SPaul Mullowney 354e057df02SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 3550298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 356e057df02SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 357e057df02SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 358e057df02SPaul Mullowney 359e057df02SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 360e057df02SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 361e057df02SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 362e057df02SPaul Mullowney reusing matrix information to achieve increased efficiency. 363e057df02SPaul Mullowney 364e057df02SPaul Mullowney Level: intermediate 365e057df02SPaul Mullowney 366e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 367e057df02SPaul Mullowney @*/ 3689ae82921SPaul Mullowney PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 3699ae82921SPaul Mullowney { 3709ae82921SPaul Mullowney PetscErrorCode ierr; 3719ae82921SPaul Mullowney PetscMPIInt size; 3729ae82921SPaul Mullowney 3739ae82921SPaul Mullowney PetscFunctionBegin; 3749ae82921SPaul Mullowney ierr = MatCreate(comm,A);CHKERRQ(ierr); 3759ae82921SPaul Mullowney ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr); 3769ae82921SPaul Mullowney ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 3779ae82921SPaul Mullowney if (size > 1) { 3789ae82921SPaul Mullowney ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr); 3799ae82921SPaul Mullowney ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr); 3809ae82921SPaul Mullowney } else { 3819ae82921SPaul Mullowney ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); 3829ae82921SPaul Mullowney ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr); 3839ae82921SPaul Mullowney } 3849ae82921SPaul Mullowney PetscFunctionReturn(0); 3859ae82921SPaul Mullowney } 3869ae82921SPaul Mullowney 3873ca39a21SBarry Smith /*MC 388e057df02SPaul Mullowney MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices. 389e057df02SPaul Mullowney 3902692e278SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 3912692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 3922692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 3939ae82921SPaul Mullowney 3949ae82921SPaul Mullowney This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 3959ae82921SPaul Mullowney and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 3969ae82921SPaul Mullowney MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 3979ae82921SPaul Mullowney for communicators controlling multiple processes. It is recommended that you call both of 3989ae82921SPaul Mullowney the above preallocation routines for simplicity. 3999ae82921SPaul Mullowney 4009ae82921SPaul Mullowney Options Database Keys: 401e057df02SPaul Mullowney + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 4028468deeeSKarl Rupp . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 4038468deeeSKarl Rupp . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 4048468deeeSKarl Rupp - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 4059ae82921SPaul Mullowney 4069ae82921SPaul Mullowney Level: beginner 4079ae82921SPaul Mullowney 4088468deeeSKarl Rupp .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 4098468deeeSKarl Rupp M 4109ae82921SPaul Mullowney M*/ 4113fa6b06aSMark Adams 4123fa6b06aSMark Adams // get GPU pointer to stripped down Mat. For both Seq and MPI Mat. 4133fa6b06aSMark Adams PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B) 4143fa6b06aSMark Adams { 4159db3cbf9SStefano Zampini #if defined(PETSC_USE_CTABLE) 4169db3cbf9SStefano Zampini SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)"); 4179db3cbf9SStefano Zampini #else 4183fa6b06aSMark Adams PetscSplitCSRDataStructure **p_d_mat; 4193fa6b06aSMark Adams PetscMPIInt size,rank; 4203fa6b06aSMark Adams MPI_Comm comm; 4213fa6b06aSMark Adams PetscErrorCode ierr; 4223fa6b06aSMark Adams int *ai,*bi,*aj,*bj; 4233fa6b06aSMark Adams PetscScalar *aa,*ba; 4243fa6b06aSMark Adams 4253fa6b06aSMark Adams PetscFunctionBegin; 4263fa6b06aSMark Adams ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 4273fa6b06aSMark Adams ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 4283fa6b06aSMark Adams ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 4293fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 4303fa6b06aSMark Adams CsrMatrix *matrixA,*matrixB=NULL; 4313fa6b06aSMark Adams if (size == 1) { 4323fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 4333fa6b06aSMark Adams p_d_mat = &cusparsestruct->deviceMat; 4343fa6b06aSMark Adams Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 4353fa6b06aSMark Adams if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 4363fa6b06aSMark Adams matrixA = (CsrMatrix*)matstruct->mat; 4373fa6b06aSMark Adams bi = bj = NULL; ba = NULL; 4383fa6b06aSMark Adams } else { 4393fa6b06aSMark Adams SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR"); 4403fa6b06aSMark Adams } 4413fa6b06aSMark Adams } else { 4423fa6b06aSMark Adams Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 4433fa6b06aSMark Adams Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 4443fa6b06aSMark Adams p_d_mat = &spptr->deviceMat; 4453fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr; 4463fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr; 4473fa6b06aSMark Adams Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 4483fa6b06aSMark Adams Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat; 4493fa6b06aSMark Adams if (cusparsestructA->format==MAT_CUSPARSE_CSR) { 4503fa6b06aSMark Adams if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR"); 4513fa6b06aSMark Adams matrixA = (CsrMatrix*)matstructA->mat; 4523fa6b06aSMark Adams matrixB = (CsrMatrix*)matstructB->mat; 4533fa6b06aSMark Adams bi = thrust::raw_pointer_cast(matrixB->row_offsets->data()); 4543fa6b06aSMark Adams bj = thrust::raw_pointer_cast(matrixB->column_indices->data()); 4553fa6b06aSMark Adams ba = thrust::raw_pointer_cast(matrixB->values->data()); 4563fa6b06aSMark Adams if (rank==-1) { 4573fa6b06aSMark Adams for(unsigned int i = 0; i < matrixB->row_offsets->size(); i++) 4583fa6b06aSMark Adams std::cout << "\trow_offsets[" << i << "] = " << (*matrixB->row_offsets)[i] << std::endl; 4593fa6b06aSMark Adams for(unsigned int i = 0; i < matrixB->column_indices->size(); i++) 4603fa6b06aSMark Adams std::cout << "\tcolumn_indices[" << i << "] = " << (*matrixB->column_indices)[i] << std::endl; 4613fa6b06aSMark Adams for(unsigned int i = 0; i < matrixB->values->size(); i++) 4623fa6b06aSMark Adams std::cout << "\tvalues[" << i << "] = " << (*matrixB->values)[i] << std::endl; 4633fa6b06aSMark Adams } 4643fa6b06aSMark Adams } else { 4653fa6b06aSMark Adams SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR"); 4663fa6b06aSMark Adams } 4673fa6b06aSMark Adams } 4683fa6b06aSMark Adams ai = thrust::raw_pointer_cast(matrixA->row_offsets->data()); 4693fa6b06aSMark Adams aj = thrust::raw_pointer_cast(matrixA->column_indices->data()); 4703fa6b06aSMark Adams aa = thrust::raw_pointer_cast(matrixA->values->data()); 4713fa6b06aSMark Adams } else { 4723fa6b06aSMark Adams *B = NULL; 4733fa6b06aSMark Adams PetscFunctionReturn(0); 4743fa6b06aSMark Adams } 4753fa6b06aSMark Adams // act like MatSetValues because not called on host 4763fa6b06aSMark Adams if (A->assembled) { 4773fa6b06aSMark Adams if (A->was_assembled) { 4783fa6b06aSMark Adams ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr); 4793fa6b06aSMark Adams } 4803fa6b06aSMark Adams A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here? 4813fa6b06aSMark Adams } else { 4823fa6b06aSMark Adams SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix"); 4833fa6b06aSMark Adams } 4843fa6b06aSMark Adams if (!*p_d_mat) { 4853fa6b06aSMark Adams cudaError_t err; 4863fa6b06aSMark Adams PetscSplitCSRDataStructure *d_mat, h_mat; 4873fa6b06aSMark Adams Mat_SeqAIJ *jaca; 4883fa6b06aSMark Adams PetscInt n = A->rmap->n, nnz; 4893fa6b06aSMark Adams // create and copy 4903fa6b06aSMark Adams ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr); 4913fa6b06aSMark Adams err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err); 4923fa6b06aSMark Adams err = cudaMemset( d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err); 4933fa6b06aSMark Adams *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up 4943fa6b06aSMark Adams if (size == 1) { 4953fa6b06aSMark Adams jaca = (Mat_SeqAIJ*)A->data; 4963fa6b06aSMark Adams h_mat.nonzerostate = A->nonzerostate; 4973fa6b06aSMark Adams h_mat.rstart = 0; h_mat.rend = A->rmap->n; 4983fa6b06aSMark Adams h_mat.cstart = 0; h_mat.cend = A->cmap->n; 4993fa6b06aSMark Adams h_mat.offdiag.i = h_mat.offdiag.j = NULL; 5003fa6b06aSMark Adams h_mat.offdiag.a = NULL; 5013fa6b06aSMark Adams h_mat.seq = PETSC_TRUE; 5023fa6b06aSMark Adams } else { 5033fa6b06aSMark Adams Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 5043fa6b06aSMark Adams Mat_SeqAIJ *jacb; 5053fa6b06aSMark Adams h_mat.seq = PETSC_FALSE; // for MatAssemblyEnd_SeqAIJCUSPARSE 5063fa6b06aSMark Adams jaca = (Mat_SeqAIJ*)aij->A->data; 5073fa6b06aSMark Adams jacb = (Mat_SeqAIJ*)aij->B->data; 5083fa6b06aSMark Adams h_mat.nonzerostate = aij->A->nonzerostate; // just keep one nonzero state? 5093fa6b06aSMark Adams if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 5103fa6b06aSMark Adams if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n"); 5113fa6b06aSMark Adams // create colmap - this is ussually done (lazy) in MatSetValues 5123fa6b06aSMark Adams aij->donotstash = PETSC_TRUE; 5133fa6b06aSMark Adams aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE; 5143fa6b06aSMark Adams jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly 5153fa6b06aSMark Adams ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr); 5163fa6b06aSMark Adams aij->colmap[A->cmap->N] = -9; 5173fa6b06aSMark Adams ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr); 5183fa6b06aSMark Adams { 5193fa6b06aSMark Adams PetscInt ii; 5203fa6b06aSMark Adams for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1; 5213fa6b06aSMark Adams } 5223fa6b06aSMark Adams if(aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9"); 5233fa6b06aSMark Adams // allocate B copy data 5243fa6b06aSMark Adams h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend; 5253fa6b06aSMark Adams h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend; 5263fa6b06aSMark Adams nnz = jacb->i[n]; 5273fa6b06aSMark Adams 5283fa6b06aSMark Adams if (jacb->compressedrow.use) { 5293fa6b06aSMark Adams err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input 5303fa6b06aSMark Adams err = cudaMemcpy( h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err); 5313fa6b06aSMark Adams } else { 5323fa6b06aSMark Adams h_mat.offdiag.i = bi; 5333fa6b06aSMark Adams } 5343fa6b06aSMark Adams h_mat.offdiag.j = bj; 5353fa6b06aSMark Adams h_mat.offdiag.a = ba; 5363fa6b06aSMark Adams 5373fa6b06aSMark Adams err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output 5383fa6b06aSMark Adams err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err); 5393fa6b06aSMark Adams h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries; 5403fa6b06aSMark Adams h_mat.offdiag.n = n; 5413fa6b06aSMark Adams } 5423fa6b06aSMark Adams // allocate A copy data 5433fa6b06aSMark Adams nnz = jaca->i[n]; 5443fa6b06aSMark Adams h_mat.diag.n = n; 5453fa6b06aSMark Adams h_mat.diag.ignorezeroentries = jaca->ignorezeroentries; 5463fa6b06aSMark Adams ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr); 5473fa6b06aSMark Adams if (jaca->compressedrow.use) { 5483fa6b06aSMark Adams err = cudaMalloc((void **)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input 5493fa6b06aSMark Adams err = cudaMemcpy( h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err); 5503fa6b06aSMark Adams } else { 5513fa6b06aSMark Adams h_mat.diag.i = ai; 5523fa6b06aSMark Adams } 5533fa6b06aSMark Adams h_mat.diag.j = aj; 5543fa6b06aSMark Adams h_mat.diag.a = aa; 5553fa6b06aSMark Adams // copy pointers and metdata to device 5563fa6b06aSMark Adams err = cudaMemcpy( d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err); 5573fa6b06aSMark Adams ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr); 5583fa6b06aSMark Adams } else { 5593fa6b06aSMark Adams *B = *p_d_mat; 5603fa6b06aSMark Adams } 5613fa6b06aSMark Adams A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues 5623fa6b06aSMark Adams PetscFunctionReturn(0); 5639db3cbf9SStefano Zampini #endif 5643fa6b06aSMark Adams } 565