111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 2076ba34aSJunchao Zhang #include <petscsf.h> 342550becSJunchao Zhang #include <petsc/private/sfimpl.h> 48c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/mpi/mpiaij.h> 542550becSJunchao Zhang #include <../src/mat/impls/aij/mpi/kokkos/mpiaijkok.hpp> 6076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp> 711d22bbfSJunchao Zhang 88c3ff71bSJunchao Zhang PetscErrorCode MatAssemblyEnd_MPIAIJKokkos(Mat A,MatAssemblyType mode) 98c3ff71bSJunchao Zhang { 105519a089SJose E. Roman Mat_SeqAIJKokkos *aijkok; 118c3ff71bSJunchao Zhang 128c3ff71bSJunchao Zhang PetscFunctionBegin; 139566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd_MPIAIJ(A,mode)); 145519a089SJose E. Roman aijkok = static_cast<Mat_SeqAIJKokkos*>(((Mat_MPIAIJ*)A->data)->A->spptr); /* Access spptr after MatAssemblyEnd_MPIAIJ(), which might have deleted old spptr */ 15a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 16a587d139SMark A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 17a587d139SMark } 18a587d139SMark 198c3ff71bSJunchao Zhang PetscFunctionReturn(0); 208c3ff71bSJunchao Zhang } 218c3ff71bSJunchao Zhang 228c3ff71bSJunchao Zhang PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJKokkos(Mat mat,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 238c3ff71bSJunchao Zhang { 248c3ff71bSJunchao Zhang Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 258c3ff71bSJunchao Zhang 268c3ff71bSJunchao Zhang PetscFunctionBegin; 279566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(mat->rmap)); 289566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(mat->cmap)); 296a29ce69SStefano Zampini #if defined(PETSC_USE_DEBUG) 308c3ff71bSJunchao Zhang if (d_nnz) { 316a29ce69SStefano Zampini PetscInt i; 328c3ff71bSJunchao Zhang for (i=0; i<mat->rmap->n; i++) { 3308401ef6SPierre Jolivet PetscCheck(d_nnz[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]); 348c3ff71bSJunchao Zhang } 358c3ff71bSJunchao Zhang } 368c3ff71bSJunchao Zhang if (o_nnz) { 376a29ce69SStefano Zampini PetscInt i; 388c3ff71bSJunchao Zhang for (i=0; i<mat->rmap->n; i++) { 3908401ef6SPierre Jolivet PetscCheck(o_nnz[i] >= 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]); 408c3ff71bSJunchao Zhang } 418c3ff71bSJunchao Zhang } 426a29ce69SStefano Zampini #endif 436a29ce69SStefano Zampini #if defined(PETSC_USE_CTABLE) 449566063dSJacob Faibussowitsch PetscCall(PetscTableDestroy(&mpiaij->colmap)); 456a29ce69SStefano Zampini #else 469566063dSJacob Faibussowitsch PetscCall(PetscFree(mpiaij->colmap)); 476a29ce69SStefano Zampini #endif 489566063dSJacob Faibussowitsch PetscCall(PetscFree(mpiaij->garray)); 499566063dSJacob Faibussowitsch PetscCall(VecDestroy(&mpiaij->lvec)); 509566063dSJacob Faibussowitsch PetscCall(VecScatterDestroy(&mpiaij->Mvctx)); 516a29ce69SStefano Zampini /* Because the B will have been resized we simply destroy it and create a new one each time */ 529566063dSJacob Faibussowitsch PetscCall(MatDestroy(&mpiaij->B)); 536a29ce69SStefano Zampini 546a29ce69SStefano Zampini if (!mpiaij->A) { 559566063dSJacob Faibussowitsch PetscCall(MatCreate(PETSC_COMM_SELF,&mpiaij->A)); 569566063dSJacob Faibussowitsch PetscCall(MatSetSizes(mpiaij->A,mat->rmap->n,mat->cmap->n,mat->rmap->n,mat->cmap->n)); 579566063dSJacob Faibussowitsch PetscCall(PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->A)); 586a29ce69SStefano Zampini } 596a29ce69SStefano Zampini if (!mpiaij->B) { 606a29ce69SStefano Zampini PetscMPIInt size; 619566063dSJacob Faibussowitsch PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size)); 629566063dSJacob Faibussowitsch PetscCall(MatCreate(PETSC_COMM_SELF,&mpiaij->B)); 639566063dSJacob Faibussowitsch PetscCall(MatSetSizes(mpiaij->B,mat->rmap->n,size > 1 ? mat->cmap->N : 0,mat->rmap->n,size > 1 ? mat->cmap->N : 0)); 649566063dSJacob Faibussowitsch PetscCall(PetscLogObjectParent((PetscObject)mat,(PetscObject)mpiaij->B)); 658c3ff71bSJunchao Zhang } 669566063dSJacob Faibussowitsch PetscCall(MatSetType(mpiaij->A,MATSEQAIJKOKKOS)); 679566063dSJacob Faibussowitsch PetscCall(MatSetType(mpiaij->B,MATSEQAIJKOKKOS)); 689566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz)); 699566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz)); 708c3ff71bSJunchao Zhang mat->preallocated = PETSC_TRUE; 718c3ff71bSJunchao Zhang PetscFunctionReturn(0); 728c3ff71bSJunchao Zhang } 738c3ff71bSJunchao Zhang 748c3ff71bSJunchao Zhang PetscErrorCode MatMult_MPIAIJKokkos(Mat mat,Vec xx,Vec yy) 758c3ff71bSJunchao Zhang { 768c3ff71bSJunchao Zhang Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 778c3ff71bSJunchao Zhang PetscInt nt; 788c3ff71bSJunchao Zhang 798c3ff71bSJunchao Zhang PetscFunctionBegin; 809566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(xx,&nt)); 8108401ef6SPierre Jolivet PetscCheck(nt == mat->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->cmap->n,nt); 829566063dSJacob Faibussowitsch PetscCall(VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD)); 839566063dSJacob Faibussowitsch PetscCall((*mpiaij->A->ops->mult)(mpiaij->A,xx,yy)); 849566063dSJacob Faibussowitsch PetscCall(VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD)); 859566063dSJacob Faibussowitsch PetscCall((*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,yy,yy)); 868c3ff71bSJunchao Zhang PetscFunctionReturn(0); 878c3ff71bSJunchao Zhang } 888c3ff71bSJunchao Zhang 898c3ff71bSJunchao Zhang PetscErrorCode MatMultAdd_MPIAIJKokkos(Mat mat,Vec xx,Vec yy,Vec zz) 908c3ff71bSJunchao Zhang { 918c3ff71bSJunchao Zhang Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 928c3ff71bSJunchao Zhang PetscInt nt; 938c3ff71bSJunchao Zhang 948c3ff71bSJunchao Zhang PetscFunctionBegin; 959566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(xx,&nt)); 9608401ef6SPierre Jolivet PetscCheck(nt == mat->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->cmap->n,nt); 979566063dSJacob Faibussowitsch PetscCall(VecScatterBegin(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD)); 989566063dSJacob Faibussowitsch PetscCall((*mpiaij->A->ops->multadd)(mpiaij->A,xx,yy,zz)); 999566063dSJacob Faibussowitsch PetscCall(VecScatterEnd(mpiaij->Mvctx,xx,mpiaij->lvec,INSERT_VALUES,SCATTER_FORWARD)); 1009566063dSJacob Faibussowitsch PetscCall((*mpiaij->B->ops->multadd)(mpiaij->B,mpiaij->lvec,zz,zz)); 1018c3ff71bSJunchao Zhang PetscFunctionReturn(0); 1028c3ff71bSJunchao Zhang } 1038c3ff71bSJunchao Zhang 1048c3ff71bSJunchao Zhang PetscErrorCode MatMultTranspose_MPIAIJKokkos(Mat mat,Vec xx,Vec yy) 1058c3ff71bSJunchao Zhang { 1068c3ff71bSJunchao Zhang Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 1078c3ff71bSJunchao Zhang PetscInt nt; 1088c3ff71bSJunchao Zhang 1098c3ff71bSJunchao Zhang PetscFunctionBegin; 1109566063dSJacob Faibussowitsch PetscCall(VecGetLocalSize(xx,&nt)); 11108401ef6SPierre Jolivet PetscCheck(nt == mat->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of mat (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",mat->rmap->n,nt); 1129566063dSJacob Faibussowitsch PetscCall((*mpiaij->B->ops->multtranspose)(mpiaij->B,xx,mpiaij->lvec)); 1139566063dSJacob Faibussowitsch PetscCall((*mpiaij->A->ops->multtranspose)(mpiaij->A,xx,yy)); 1149566063dSJacob Faibussowitsch PetscCall(VecScatterBegin(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE)); 1159566063dSJacob Faibussowitsch PetscCall(VecScatterEnd(mpiaij->Mvctx,mpiaij->lvec,yy,ADD_VALUES,SCATTER_REVERSE)); 1168c3ff71bSJunchao Zhang PetscFunctionReturn(0); 1178c3ff71bSJunchao Zhang } 1188c3ff71bSJunchao Zhang 119076ba34aSJunchao Zhang /* Merge the "A, B" matrices of mat into a matrix C. mat's type is MPIAIJKOKKOS. C's type is MATSEQAIJKOKKOS. 120076ba34aSJunchao Zhang A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n). 121076ba34aSJunchao Zhang C still uses local column ids. Their corresponding global column ids are returned in glob. 122076ba34aSJunchao Zhang */ 123076ba34aSJunchao Zhang PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJKokkos(Mat mat,MatReuse reuse,IS *glob,Mat *C) 124076ba34aSJunchao Zhang { 125076ba34aSJunchao Zhang Mat Ad,Ao; 126076ba34aSJunchao Zhang const PetscInt *cmap; 127076ba34aSJunchao Zhang 128076ba34aSJunchao Zhang PetscFunctionBegin; 1299566063dSJacob Faibussowitsch PetscCall(MatMPIAIJGetSeqAIJ(mat,&Ad,&Ao,&cmap)); 1309566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosMergeMats(Ad,Ao,reuse,C)); 131076ba34aSJunchao Zhang if (glob) { 132076ba34aSJunchao Zhang PetscInt cst, i, dn, on, *gidx; 1339566063dSJacob Faibussowitsch PetscCall(MatGetLocalSize(Ad,NULL,&dn)); 1349566063dSJacob Faibussowitsch PetscCall(MatGetLocalSize(Ao,NULL,&on)); 1359566063dSJacob Faibussowitsch PetscCall(MatGetOwnershipRangeColumn(mat,&cst,NULL)); 1369566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(dn+on,&gidx)); 137076ba34aSJunchao Zhang for (i=0; i<dn; i++) gidx[i] = cst + i; 138076ba34aSJunchao Zhang for (i=0; i<on; i++) gidx[i+dn] = cmap[i]; 1399566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob)); 140076ba34aSJunchao Zhang } 141076ba34aSJunchao Zhang PetscFunctionReturn(0); 142076ba34aSJunchao Zhang } 143076ba34aSJunchao Zhang 144076ba34aSJunchao Zhang /* Structs used in matrix product C=AB, C=A^tB and C=B^tAB */ 145076ba34aSJunchao Zhang struct MatMatStruct { 146076ba34aSJunchao Zhang MatRowMapKokkosView Cdstart; /* Used to split sequential matrix into petsc's A, B format */ 147076ba34aSJunchao Zhang PetscSF sf; /* SF to send/recv matrix entries */ 148076ba34aSJunchao Zhang MatScalarKokkosView abuf; /* buf of mat values in send/recv */ 149076ba34aSJunchao Zhang Mat C1,C2,B_local; 150076ba34aSJunchao Zhang KokkosCsrMatrix C1_global,C2_global,C_global; 151076ba34aSJunchao Zhang KernelHandle kh; 152076ba34aSJunchao Zhang MatMatStruct() { 153076ba34aSJunchao Zhang C1 = C2 = B_local = NULL; 154076ba34aSJunchao Zhang sf = NULL; 155076ba34aSJunchao Zhang } 156076ba34aSJunchao Zhang 157076ba34aSJunchao Zhang ~MatMatStruct() { 158076ba34aSJunchao Zhang MatDestroy(&C1); 159076ba34aSJunchao Zhang MatDestroy(&C2); 160076ba34aSJunchao Zhang MatDestroy(&B_local); 161076ba34aSJunchao Zhang PetscSFDestroy(&sf); 162076ba34aSJunchao Zhang kh.destroy_spadd_handle(); 163076ba34aSJunchao Zhang } 164076ba34aSJunchao Zhang }; 165076ba34aSJunchao Zhang 166076ba34aSJunchao Zhang struct MatMatStruct_AB : public MatMatStruct { 167076ba34aSJunchao Zhang MatColIdxKokkosView rows; 168076ba34aSJunchao Zhang MatRowMapKokkosView rowoffset; 169076ba34aSJunchao Zhang Mat B_other,C_petsc; /* SEQAIJKOKKOS matrices. TODO: have a better var name than C_petsc */ 170076ba34aSJunchao Zhang 171076ba34aSJunchao Zhang MatMatStruct_AB() : B_other(NULL),C_petsc(NULL){} 172076ba34aSJunchao Zhang ~MatMatStruct_AB() { 173076ba34aSJunchao Zhang MatDestroy(&B_other); 174076ba34aSJunchao Zhang MatDestroy(&C_petsc); 175076ba34aSJunchao Zhang } 176076ba34aSJunchao Zhang }; 177076ba34aSJunchao Zhang 178076ba34aSJunchao Zhang struct MatMatStruct_AtB : public MatMatStruct { 179076ba34aSJunchao Zhang MatRowMapKokkosView srcrowoffset,dstrowoffset; 180076ba34aSJunchao Zhang }; 181076ba34aSJunchao Zhang 182076ba34aSJunchao Zhang struct MatProductData_MPIAIJKokkos 183076ba34aSJunchao Zhang { 184076ba34aSJunchao Zhang MatMatStruct_AB *mmAB; 185076ba34aSJunchao Zhang MatMatStruct_AtB *mmAtB; 186076ba34aSJunchao Zhang PetscBool reusesym; 187076ba34aSJunchao Zhang 188076ba34aSJunchao Zhang MatProductData_MPIAIJKokkos(): mmAB(NULL),mmAtB(NULL),reusesym(PETSC_FALSE){} 189076ba34aSJunchao Zhang ~MatProductData_MPIAIJKokkos() { 190076ba34aSJunchao Zhang delete mmAB; 191076ba34aSJunchao Zhang delete mmAtB; 192076ba34aSJunchao Zhang } 193076ba34aSJunchao Zhang }; 194076ba34aSJunchao Zhang 195076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_MPIAIJKokkos(void *data) 196076ba34aSJunchao Zhang { 197076ba34aSJunchao Zhang PetscFunctionBegin; 1989566063dSJacob Faibussowitsch PetscCallCXX(delete static_cast<MatProductData_MPIAIJKokkos*>(data)); 199076ba34aSJunchao Zhang PetscFunctionReturn(0); 200076ba34aSJunchao Zhang } 201076ba34aSJunchao Zhang 202076ba34aSJunchao Zhang /* MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds - Get a KokkosCsrMatrix from a MATSEQAIJKOKKOS matrix 203076ba34aSJunchao Zhang 204076ba34aSJunchao Zhang Input Parameters: 205076ba34aSJunchao Zhang + A - the MATSEQAIJKOKKOS matrix 206076ba34aSJunchao Zhang . N - new column size for the returned Kokkos matrix 207076ba34aSJunchao Zhang - l2g - a map that maps old col ids to new col ids 208076ba34aSJunchao Zhang 209076ba34aSJunchao Zhang Output Parameters: 210076ba34aSJunchao Zhang . csrmat - the Kokkos matrix, which has the same row size as A, shares a, i but not j with A. 211076ba34aSJunchao Zhang */ 212076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(Mat A,PetscInt N,const ConstMatColIdxKokkosView& l2g,KokkosCsrMatrix& csrmat) 213076ba34aSJunchao Zhang { 214076ba34aSJunchao Zhang KokkosCsrMatrix& orig = static_cast<Mat_SeqAIJKokkos*>(A->spptr)->csrmat; 215076ba34aSJunchao Zhang MatColIdxKokkosView jg("jg",orig.nnz()); /* New j array for csrmat */ 216076ba34aSJunchao Zhang 217076ba34aSJunchao Zhang PetscFunctionBegin; 2189566063dSJacob Faibussowitsch PetscCallCXX(Kokkos::parallel_for(orig.nnz(), KOKKOS_LAMBDA(const PetscInt i) {jg(i) = l2g(orig.graph.entries(i));})); 2199566063dSJacob Faibussowitsch PetscCallCXX(csrmat = KokkosCsrMatrix("csrmat",orig.numRows(),N,orig.nnz(),orig.values,orig.graph.row_map,jg)); 220076ba34aSJunchao Zhang PetscFunctionReturn(0); 221076ba34aSJunchao Zhang } 222076ba34aSJunchao Zhang 223076ba34aSJunchao Zhang /* MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices - Set the diag and offdiag matrices of a MATMPIAIJKOKKOS matrix. 224076ba34aSJunchao Zhang It is similar to MatCreateMPIAIJWithSplitArrays. 225076ba34aSJunchao Zhang 226076ba34aSJunchao Zhang Input Parameters: 227076ba34aSJunchao Zhang + mat - the MATMPIAIJKOKKOS matrix, which should have its type and layout set, but should not have its diag, offdiag matrices set 228076ba34aSJunchao Zhang . A - the diag matrix using local col ids 229076ba34aSJunchao Zhang - B - the offdiag matrix using global col ids 230076ba34aSJunchao Zhang 231076ba34aSJunchao Zhang Output Parameters: 232076ba34aSJunchao Zhang . mat - the updated MATMPIAIJKOKKOS matrix 233076ba34aSJunchao Zhang */ 234076ba34aSJunchao Zhang static PetscErrorCode MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(Mat mat,Mat A,Mat B) 235076ba34aSJunchao Zhang { 236076ba34aSJunchao Zhang Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); 237076ba34aSJunchao Zhang PetscInt m,n,M,N,Am,An,Bm,Bn; 238076ba34aSJunchao Zhang Mat_SeqAIJKokkos *bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 239076ba34aSJunchao Zhang 240076ba34aSJunchao Zhang PetscFunctionBegin; 2419566063dSJacob Faibussowitsch PetscCall(MatGetSize(mat,&M,&N)); 2429566063dSJacob Faibussowitsch PetscCall(MatGetLocalSize(mat,&m,&n)); 2439566063dSJacob Faibussowitsch PetscCall(MatGetLocalSize(A,&Am,&An)); 2449566063dSJacob Faibussowitsch PetscCall(MatGetLocalSize(B,&Bm,&Bn)); 245076ba34aSJunchao Zhang 246aed4548fSBarry Smith PetscCheck(m == Am && m == Bm,PETSC_COMM_SELF,PETSC_ERR_PLIB,"local number of rows do not match"); 24708401ef6SPierre Jolivet PetscCheck(n == An,PETSC_COMM_SELF,PETSC_ERR_PLIB,"local number of columns do not match"); 24808401ef6SPierre Jolivet PetscCheck(N == Bn,PETSC_COMM_SELF,PETSC_ERR_PLIB,"global number of columns do not match"); 24908401ef6SPierre Jolivet PetscCheck(!mpiaij->A && !mpiaij->B,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A, B of the MPIAIJ matrix are not empty"); 250076ba34aSJunchao Zhang mpiaij->A = A; 251076ba34aSJunchao Zhang mpiaij->B = B; 252076ba34aSJunchao Zhang 253076ba34aSJunchao Zhang mat->preallocated = PETSC_TRUE; 254076ba34aSJunchao Zhang mat->nooffprocentries = PETSC_TRUE; /* See MatAssemblyBegin_MPIAIJ. In effect, making MatAssemblyBegin a nop */ 255076ba34aSJunchao Zhang 2569566063dSJacob Faibussowitsch PetscCall(MatSetOption(mat,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE)); 2579566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY)); 258076ba34aSJunchao Zhang /* MatAssemblyEnd is critical here. It sets mat->offloadmask according to A and B's, and 259076ba34aSJunchao Zhang also gets mpiaij->B compacted, with its col ids and size reduced 260076ba34aSJunchao Zhang */ 2619566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY)); 2629566063dSJacob Faibussowitsch PetscCall(MatSetOption(mat,MAT_NO_OFF_PROC_ENTRIES,PETSC_FALSE)); 2639566063dSJacob Faibussowitsch PetscCall(MatSetOption(mat,MAT_NEW_NONZERO_LOCATION_ERR,PETSC_TRUE)); 264076ba34aSJunchao Zhang 265076ba34aSJunchao Zhang /* Update bkok with new local col ids (stored on host) and size */ 266076ba34aSJunchao Zhang bkok->j_dual.modify_host(); 267076ba34aSJunchao Zhang bkok->j_dual.sync_device(); 268076ba34aSJunchao Zhang bkok->SetColSize(mpiaij->B->cmap->n); 269076ba34aSJunchao Zhang PetscFunctionReturn(0); 270076ba34aSJunchao Zhang } 271076ba34aSJunchao Zhang 272076ba34aSJunchao Zhang /* MatSeqAIJKokkosBcast - Bcast rows of a SEQAIJKOKKOS matrice (B) to form a SEQAIJKOKKOS matrix (C). 273076ba34aSJunchao Zhang 274076ba34aSJunchao Zhang It is essentially the MPIAIJKOKKOS counterpart of MatGetBrowsOfAoCols_MPIAIJ, but supports device and uses PetscSF. 275076ba34aSJunchao Zhang In the given ownerSF, leaves correspond to rows in C, and roots correspond to rows in B. Roots may connect to multiple leaves. 276076ba34aSJunchao Zhang Suppose C's j-th row is connected to a root identified by PetscSFNode (k,i), it means we will bcast the i-th row of B on rank k 277076ba34aSJunchao Zhang to j-th row of C. ownerSF's leaves must be contiguous (in other words, as if ilocal=NULL was used to set its graph). 278076ba34aSJunchao Zhang 279076ba34aSJunchao Zhang Collective on comm of ownerSF 280076ba34aSJunchao Zhang 281076ba34aSJunchao Zhang Input Parameters: 282076ba34aSJunchao Zhang + B - the SEQAIJKOKKOS matrix, using local col ids 283076ba34aSJunchao Zhang . reuse - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX 284076ba34aSJunchao Zhang . N - global col ids are in range of [0,N). N Must be the same across ranks (nonsignificant in MAT_REUSE_MATRIX) 285076ba34aSJunchao Zhang . l2g - a map mapping B's local col ids to global ones (nonsignificant in MAT_REUSE_MATRIX) 286076ba34aSJunchao Zhang . ownerSF - the ownership SF (nonsignificant in MAT_REUSE_MATRIX) 287076ba34aSJunchao Zhang 288076ba34aSJunchao Zhang Input/Output Parameters (out when resue = MAT_INITIAL_MATRIX, inout when reuse = MAT_REUSE_MATRIX) 289076ba34aSJunchao Zhang + bcastSF - the SF used to bcast rows of B. This plain SF does buffer (abuf) to buffer (Ca) send/recv. In this SF, vertices are nonzeros. 290076ba34aSJunchao Zhang . abuf - buffer for sending matrix values 291076ba34aSJunchao Zhang . rows - array containing indices of (local) rows that this rank needs to bcast to others. Each receiver rank has a chunk in rows[]. 292076ba34aSJunchao Zhang Values in rows[] might have repeats, which simply indicates a row will be bcast'ed to multiple neighbors. 293076ba34aSJunchao Zhang . rowoffset - For each row in rows[], it will be copied to rowoffset[] at abuf[] 294076ba34aSJunchao Zhang - C - the SEQAIJKOKKOS matrix made of the bcast'ed rows, using local col ids. 295076ba34aSJunchao Zhang */ 296076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosBcast(Mat B,MatReuse reuse,PetscInt N,const ConstMatColIdxKokkosView& l2g,PetscSF ownerSF, 297076ba34aSJunchao Zhang PetscSF& bcastSF,MatScalarKokkosView& abuf,MatColIdxKokkosView& rows, 298076ba34aSJunchao Zhang MatRowMapKokkosView& rowoffset,Mat& C) 299076ba34aSJunchao Zhang { 300076ba34aSJunchao Zhang Mat_SeqAIJKokkos *bkok,*ckok; 301076ba34aSJunchao Zhang 302076ba34aSJunchao Zhang PetscFunctionBegin; 3039566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); /* Make sure B->spptr is accessible */ 304076ba34aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 305076ba34aSJunchao Zhang 306076ba34aSJunchao Zhang if (reuse == MAT_REUSE_MATRIX) { 307076ba34aSJunchao Zhang ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 308076ba34aSJunchao Zhang 309076ba34aSJunchao Zhang const auto& Ba = bkok->a_dual.view_device(); 310076ba34aSJunchao Zhang const auto& Bi = bkok->i_dual.view_device(); 311076ba34aSJunchao Zhang const auto& Ca = ckok->a_dual.view_device(); 312076ba34aSJunchao Zhang 313076ba34aSJunchao Zhang /* Copy Ba to abuf */ 314076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 315076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* rows[i] is r-th row of B */ 316076ba34aSJunchao Zhang PetscInt r = rows(i); 317076ba34aSJunchao Zhang PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */ 318076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t,Bi(r+1)-Bi(r)),[&](PetscInt k) { 319076ba34aSJunchao Zhang abuf(base+k) = Ba(Bi(r)+k); 320076ba34aSJunchao Zhang }); 321076ba34aSJunchao Zhang }); 322076ba34aSJunchao Zhang 323076ba34aSJunchao Zhang /* Send abuf to Ca through bcastSF and then mark C is updated on device */ 3249566063dSJacob Faibussowitsch PetscCall(PetscSFBcastBegin(bcastSF,MPIU_SCALAR,abuf.data(),Ca.data(),MPI_REPLACE)); /* TODO: get memtype for abuf */ 3259566063dSJacob Faibussowitsch PetscCall(PetscSFBcastEnd (bcastSF,MPIU_SCALAR,abuf.data(),Ca.data(),MPI_REPLACE)); 326076ba34aSJunchao Zhang ckok->a_dual.modify_device(); 327076ba34aSJunchao Zhang } else if (reuse == MAT_INITIAL_MATRIX) { 328076ba34aSJunchao Zhang MPI_Comm comm; 329076ba34aSJunchao Zhang PetscMPIInt tag; 330076ba34aSJunchao Zhang PetscInt k,Cm,Cn,Cnnz,*Ci_h,nroots,nleaves; 331076ba34aSJunchao Zhang 3329566063dSJacob Faibussowitsch PetscCallMPI(PetscObjectGetComm((PetscObject)ownerSF,&comm)); 3339566063dSJacob Faibussowitsch PetscCall(PetscSFGetGraph(ownerSF,&nroots,&nleaves,NULL,NULL)); 334076ba34aSJunchao Zhang Cm = nleaves; /* row size of C */ 335076ba34aSJunchao Zhang Cn = N; /* col size of C, which initially uses global ids, so we can safely set its col size as N */ 336076ba34aSJunchao Zhang 337076ba34aSJunchao Zhang /* Get row lens (nz) of B's rows for later fast query */ 338076ba34aSJunchao Zhang PetscInt *Browlens; 339076ba34aSJunchao Zhang const PetscInt *tmp = bkok->i_host_data(); 3409566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nroots,&Browlens)); 341076ba34aSJunchao Zhang for (k=0; k<nroots; k++) Browlens[k] = tmp[k+1]-tmp[k]; 342076ba34aSJunchao Zhang 343076ba34aSJunchao Zhang /* By ownerSF, each proc gets lens of rows of C */ 344076ba34aSJunchao Zhang MatRowMapKokkosDualView Ci("i",Cm+1); /* C's rowmap */ 345076ba34aSJunchao Zhang Ci_h = Ci.view_host().data(); 346076ba34aSJunchao Zhang Ci_h[0] = 0; 3479566063dSJacob Faibussowitsch PetscCall(PetscSFBcastWithMemTypeBegin(ownerSF,MPIU_INT,PETSC_MEMTYPE_HOST,Browlens,PETSC_MEMTYPE_HOST,&Ci_h[1],MPI_REPLACE)); 3489566063dSJacob Faibussowitsch PetscCall(PetscSFBcastEnd(ownerSF,MPIU_INT,Browlens,&Ci_h[1],MPI_REPLACE)); 349076ba34aSJunchao Zhang for (k=1; k<Cm+1; k++) Ci_h[k] += Ci_h[k-1]; /* Convert lens to CSR */ 350076ba34aSJunchao Zhang Cnnz = Ci_h[Cm]; 351076ba34aSJunchao Zhang Ci.modify_host(); 352076ba34aSJunchao Zhang Ci.sync_device(); 353076ba34aSJunchao Zhang 354076ba34aSJunchao Zhang /* With the newly known Cnnz, we are able to allocate (j, a) for C on host & device */ 355076ba34aSJunchao Zhang MatColIdxKokkosDualView Cj("j",Cnnz); 356076ba34aSJunchao Zhang MatScalarKokkosDualView Ca("a",Cnnz); 357076ba34aSJunchao Zhang 358076ba34aSJunchao Zhang /* Now build the bcastSF to fill Ca, Cj. This plain SF only does (contiguous) buffer to buffer send/recv */ 359076ba34aSJunchao Zhang const PetscMPIInt *iranks,*ranks; 360076ba34aSJunchao Zhang const PetscInt *ioffset,*irootloc,*roffset; 361076ba34aSJunchao Zhang PetscInt i,j,niranks,nranks,*sdisp,*rdisp,*rowptr; 362076ba34aSJunchao Zhang MPI_Request *reqs; 363076ba34aSJunchao Zhang 3649566063dSJacob Faibussowitsch PetscCall(PetscSFGetLeafRanks(ownerSF,&niranks,&iranks,&ioffset,&irootloc)); /* irootloc[] contains indices of rows I need to send to each receiver */ 3659566063dSJacob Faibussowitsch PetscCall(PetscSFGetRootRanks(ownerSF,&nranks,&ranks,&roffset,NULL/*rmine*/,NULL/*rremote*/)); /* recv info */ 366076ba34aSJunchao Zhang 367076ba34aSJunchao Zhang /* figure out offsets at the send buffer, to build the SF 368076ba34aSJunchao Zhang sdisp[] - stores offsets of nonzeros (in abuf or jbuf, see later) I need to send, per receiver. 369076ba34aSJunchao Zhang rowptr[] - stores offsets for data of each row in abuf 370076ba34aSJunchao Zhang 371076ba34aSJunchao Zhang rdisp[] - to receive sdisp[] 372076ba34aSJunchao Zhang */ 3739566063dSJacob Faibussowitsch PetscCall(PetscMalloc3(niranks+1,&sdisp,nranks,&rdisp,niranks+nranks,&reqs)); 374076ba34aSJunchao Zhang MatRowMapKokkosViewHost rowptr_h("rowptr_h",ioffset[niranks]+1); /* Let Kokkos do the allocation, so that we can do an easy mirror later */ 375076ba34aSJunchao Zhang rowptr = rowptr_h.data(); 376076ba34aSJunchao Zhang 377076ba34aSJunchao Zhang sdisp[0] = 0; 378076ba34aSJunchao Zhang rowptr[0] = 0; 379076ba34aSJunchao Zhang for (i=0; i<niranks; i++) { /* for each receiver */ 380076ba34aSJunchao Zhang PetscInt len, nz = 0; 381076ba34aSJunchao Zhang for (j=ioffset[i]; j<ioffset[i+1]; j++) { /* for each row to this receiver */ 382076ba34aSJunchao Zhang len = Browlens[irootloc[j]]; 383076ba34aSJunchao Zhang rowptr[j+1] = rowptr[j] + len; 384076ba34aSJunchao Zhang nz += len; 385076ba34aSJunchao Zhang } 386076ba34aSJunchao Zhang sdisp[i+1] = sdisp[i] + nz; 387076ba34aSJunchao Zhang } 3889566063dSJacob Faibussowitsch PetscCallMPI(PetscCommGetNewTag(comm,&tag)); 3899566063dSJacob Faibussowitsch for (i=0; i<nranks; i++) PetscCallMPI(MPI_Irecv(&rdisp[i],1,MPIU_INT,ranks[i],tag,comm,&reqs[i])); 3909566063dSJacob Faibussowitsch for (i=0; i<niranks; i++) PetscCallMPI(MPI_Isend(&sdisp[i],1,MPIU_INT,iranks[i],tag,comm,&reqs[nranks+i])); 3919566063dSJacob Faibussowitsch PetscCallMPI(MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE)); 392076ba34aSJunchao Zhang 393076ba34aSJunchao Zhang PetscInt nleaves2 = Cnnz; /* leaves are the nonzeros I will receive */ 394076ba34aSJunchao Zhang PetscInt nroots2 = sdisp[niranks]; /* roots are the nonzeros (in abuf) I will send */ 395076ba34aSJunchao Zhang PetscSFNode *iremote; 3969566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nleaves2,&iremote)); 397076ba34aSJunchao Zhang for (i=0; i<nranks; i++) { /* for each sender */ 398076ba34aSJunchao Zhang k = 0; 399076ba34aSJunchao Zhang for (j=Ci_h[roffset[i]]; j<Ci_h[roffset[i+1]]; j++) { 400076ba34aSJunchao Zhang iremote[j].rank = ranks[i]; 401076ba34aSJunchao Zhang iremote[j].index = rdisp[i] + k; 402076ba34aSJunchao Zhang k++; 403076ba34aSJunchao Zhang } 404076ba34aSJunchao Zhang } 405076ba34aSJunchao Zhang /* TODO: we should extend PetscSF APIs for this buffer-to-buffer send/recv */ 4069566063dSJacob Faibussowitsch PetscCall(PetscSFCreate(comm,&bcastSF)); 4079566063dSJacob Faibussowitsch PetscCall(PetscSFSetGraph(bcastSF,nroots2,nleaves2,NULL/*ilocal*/,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER)); 408076ba34aSJunchao Zhang 409076ba34aSJunchao Zhang /* Extract selected rows of B, and copy their (a, j) into abuf[] and jbuf[], with j converted 410076ba34aSJunchao Zhang from local to global. Then use bcastSF to fill Ca, Cj. 411076ba34aSJunchao Zhang */ 412076ba34aSJunchao Zhang ConstMatColIdxKokkosViewHost rows_h(irootloc,ioffset[niranks]); /* irootloc[] stores indices of rows I need to send */ 413076ba34aSJunchao Zhang MatColIdxKokkosView rows("rows",ioffset[niranks]); 414076ba34aSJunchao Zhang Kokkos::deep_copy(rows,rows_h); /* Use deep copy since irootoc is managed by PetscSF and we want 'rows' to be standalone */ 415076ba34aSJunchao Zhang 416076ba34aSJunchao Zhang rowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),rowptr_h); /* If no device, rowoffset will be an alias to rowptr_h */ 417076ba34aSJunchao Zhang 418076ba34aSJunchao Zhang MatColIdxKokkosView jbuf("jbuf",sdisp[niranks]); /* send buf for (global) col ids */ 419076ba34aSJunchao Zhang abuf = MatScalarKokkosView("abuf",sdisp[niranks]); /* send buf for mat values */ 420076ba34aSJunchao Zhang 421076ba34aSJunchao Zhang const auto& Ba = bkok->a_dual.view_device(); 422076ba34aSJunchao Zhang const auto& Bi = bkok->i_dual.view_device(); 423076ba34aSJunchao Zhang const auto& Bj = bkok->j_dual.view_device(); 424076ba34aSJunchao Zhang 425076ba34aSJunchao Zhang /* Copy Ba, Bj to abuf, jbuf with change col ids from local to global */ 426076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(rows.extent(0), Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 427076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* rows[i] is r-th row of B */ 428076ba34aSJunchao Zhang PetscInt r = rows(i); 429076ba34aSJunchao Zhang PetscInt base = rowoffset(i); /* Copy r-th row of B to this offset in abuf[] */ 430076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t,Bi(r+1)-Bi(r)),[&](PetscInt k) { 431076ba34aSJunchao Zhang abuf(base+k) = Ba(Bi(r)+k); 432076ba34aSJunchao Zhang jbuf(base+k) = l2g(Bj(Bi(r)+k)); 433076ba34aSJunchao Zhang }); 434076ba34aSJunchao Zhang }); 435076ba34aSJunchao Zhang 436076ba34aSJunchao Zhang /* Send abuf & jbuf to fill Ca, Cj */ 4379566063dSJacob Faibussowitsch PetscCall(PetscSFBcastBegin(bcastSF,MPIU_INT, jbuf.data(),Cj.view_device().data(),MPI_REPLACE)); 4389566063dSJacob Faibussowitsch PetscCall(PetscSFBcastBegin(bcastSF,MPIU_SCALAR,abuf.data(),Ca.view_device().data(),MPI_REPLACE)); 4399566063dSJacob Faibussowitsch PetscCall(PetscSFBcastEnd (bcastSF,MPIU_INT, jbuf.data(),Cj.view_device().data(),MPI_REPLACE)); 4409566063dSJacob Faibussowitsch PetscCall(PetscSFBcastEnd (bcastSF,MPIU_SCALAR,abuf.data(),Ca.view_device().data(),MPI_REPLACE)); 441076ba34aSJunchao Zhang Cj.modify_device(); /* Mark Cj, Ca modified on device, but only sync Cj since we might not need Ca on host at all */ 442076ba34aSJunchao Zhang Cj.sync_host(); 443076ba34aSJunchao Zhang Ca.modify_device(); 444076ba34aSJunchao Zhang 445076ba34aSJunchao Zhang /* Construct C with Ca, Ci, Cj */ 446076ba34aSJunchao Zhang auto ckok = new Mat_SeqAIJKokkos(Cm,Cn,Cnnz,Ci,Cj,Ca); 4479566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,&C)); 4489566063dSJacob Faibussowitsch PetscCall(PetscFree3(sdisp,rdisp,reqs)); 4499566063dSJacob Faibussowitsch PetscCall(PetscFree(Browlens)); 45098921bdaSJacob Faibussowitsch } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unsupported MatReuse enum %d",reuse); 451076ba34aSJunchao Zhang PetscFunctionReturn(0); 452076ba34aSJunchao Zhang } 453076ba34aSJunchao Zhang 454076ba34aSJunchao Zhang /* MatSeqAIJKokkosReduce - Reduce rows of a SEQAIJKOKKOS matrix (A) to form a Kokkos Csr matrix (C) 455076ba34aSJunchao Zhang 456076ba34aSJunchao Zhang It is the reverse of MatSeqAIJKokkosBcast in some sense. 457076ba34aSJunchao Zhang 458076ba34aSJunchao Zhang Think each row of A as a leaf, then the given ownerSF specifies roots of the leaves. Roots may connect to multiple leaves. 459076ba34aSJunchao Zhang In this routine, we reduce (i.e., concatenate) leaves (rows) at their roots to form potentially longer rows in C. Such rows might 460076ba34aSJunchao Zhang contain repeats, which does not matter since they will be summed up by other routines. C's row size will be nroots of ownerSF. 461076ba34aSJunchao Zhang 462076ba34aSJunchao Zhang Input Parameters: 463076ba34aSJunchao Zhang + A - the SEQAIJKOKKOS matrix to be reduced 464076ba34aSJunchao Zhang . reuse - either MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX 465076ba34aSJunchao Zhang . local - true if A uses local col ids; false if A is already in global col ids. 466076ba34aSJunchao Zhang . N - if local, N is A's global col size 467076ba34aSJunchao Zhang . l2g - if local, a map mapping A's local col ids to global ones, which are in range of [0,N). 468076ba34aSJunchao Zhang - ownerSF - the SF specifies ownership (root) of rows in A 469076ba34aSJunchao Zhang 470076ba34aSJunchao Zhang Output Parameters: 471076ba34aSJunchao Zhang + reduceSF - the SF to reduce A's rows to contiguous buffers at the receiver side 472076ba34aSJunchao Zhang . abuf - a contiguous buffer to receive A's rows sent to this proc. Suppose there are 'nrows' such rows. 473076ba34aSJunchao Zhang . srcrowoffset - offset array of size nrows+1. Each entry is the corresponding row's offset in abuf[]. srcrowoffset[i+1]-srcrowoffset[i] is row i's len. 474076ba34aSJunchao Zhang . dstrowoffset - offset array of size nrows. Each entry is the corresponding row's offset in Ca[], i.e., C's 'a' array. Row i, i+1 in abuf[] may go to 475076ba34aSJunchao Zhang unrelated places in Ca, so dstrowoffset is not in CSR-like format as srcrowoffset. 476076ba34aSJunchao Zhang - C - the matrix made up by rows sent to me from other ranks, using global col ids 477076ba34aSJunchao Zhang 478076ba34aSJunchao Zhang TODO: we can even have MatSeqAIJKokkosReduceBegin/End to provide oppertunity for callers to overlap comp./comm. when reuse = MAT_REUSE_MATRIX. 479076ba34aSJunchao Zhang */ 480076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosReduce(Mat A,MatReuse reuse,PetscBool local,PetscInt N,const ConstMatColIdxKokkosView& l2g,PetscSF ownerSF, 481076ba34aSJunchao Zhang PetscSF& reduceSF,MatScalarKokkosView& abuf, 482076ba34aSJunchao Zhang MatRowMapKokkosView& srcrowoffset,MatRowMapKokkosView& dstrowoffset, 483076ba34aSJunchao Zhang KokkosCsrMatrix& C) 484076ba34aSJunchao Zhang { 485076ba34aSJunchao Zhang PetscInt i,r,Am,An,Annz,Cnnz,nrows; 486076ba34aSJunchao Zhang const PetscInt *Ai; 487076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok; 488076ba34aSJunchao Zhang 489076ba34aSJunchao Zhang PetscFunctionBegin; 4909566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); /* So that A's latest data is on device */ 4919566063dSJacob Faibussowitsch PetscCall(MatGetSize(A,&Am,&An)); 492076ba34aSJunchao Zhang Ai = static_cast<Mat_SeqAIJ*>(A->data)->i; 493076ba34aSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 494076ba34aSJunchao Zhang Annz = Ai[Am]; 495076ba34aSJunchao Zhang 496076ba34aSJunchao Zhang if (reuse == MAT_REUSE_MATRIX) { 497076ba34aSJunchao Zhang /* Send Aa to abuf */ 4989566063dSJacob Faibussowitsch PetscCallMPI(PetscSFReduceBegin(reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE)); 4999566063dSJacob Faibussowitsch PetscCallMPI(PetscSFReduceEnd (reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE)); 500076ba34aSJunchao Zhang 501076ba34aSJunchao Zhang /* Copy abuf to Ca */ 502076ba34aSJunchao Zhang const MatScalarKokkosView& Ca = C.values; 503076ba34aSJunchao Zhang nrows = dstrowoffset.extent(0); /* Not srcrowoffset[] since it has an extra entry for CSR */ 504076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 505076ba34aSJunchao Zhang PetscInt i = t.league_rank(); 506076ba34aSJunchao Zhang PetscInt src = srcrowoffset(i), dst = dstrowoffset(i); 507076ba34aSJunchao Zhang PetscInt len = srcrowoffset(i+1) - srcrowoffset(i); 508076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t,len), [&](PetscInt k) {Ca(dst+k) = abuf(src+k);}); 509076ba34aSJunchao Zhang }); 510076ba34aSJunchao Zhang } else if (reuse == MAT_INITIAL_MATRIX) { 511076ba34aSJunchao Zhang MPI_Comm comm; 512076ba34aSJunchao Zhang MPI_Request *reqs; 513076ba34aSJunchao Zhang PetscMPIInt tag; 514076ba34aSJunchao Zhang PetscInt Cm; 515076ba34aSJunchao Zhang 5169566063dSJacob Faibussowitsch PetscCall(PetscObjectGetComm((PetscObject)ownerSF,&comm)); 5179566063dSJacob Faibussowitsch PetscCall(PetscCommGetNewTag(comm,&tag)); 518076ba34aSJunchao Zhang 519076ba34aSJunchao Zhang PetscInt niranks,nranks,nroots,nleaves; 520076ba34aSJunchao Zhang const PetscMPIInt *iranks,*ranks; 521076ba34aSJunchao Zhang const PetscInt *ioffset,*rows,*roffset; /* rows[] contains local indices of rows scattered to me from others. ioffset[] is a CSR on rows[] */ 5229566063dSJacob Faibussowitsch PetscCall(PetscSFSetUp(ownerSF)); 5239566063dSJacob Faibussowitsch PetscCall(PetscSFGetLeafRanks(ownerSF,&niranks,&iranks,&ioffset,&rows)); /* recv info: iranks[] will send rows to me */ 5249566063dSJacob Faibussowitsch PetscCall(PetscSFGetRootRanks(ownerSF,&nranks,&ranks,&roffset,NULL/*rmine*/,NULL/*rremote*/)); /* send info */ 5259566063dSJacob Faibussowitsch PetscCall(PetscSFGetGraph(ownerSF,&nroots,&nleaves,NULL,NULL)); 52608401ef6SPierre Jolivet PetscCheck(nleaves == Am,PETSC_COMM_SELF,PETSC_ERR_PLIB,"ownerSF's nleaves(%" PetscInt_FMT ") != row size of A(%" PetscInt_FMT ")",nleaves,Am); 527076ba34aSJunchao Zhang Cm = nroots; 528076ba34aSJunchao Zhang nrows = ioffset[niranks]; /* # of rows to be received. Might receive same row (each is partial) from different senders */ 529076ba34aSJunchao Zhang 530076ba34aSJunchao Zhang /* Tell owners how long each row I will send */ 531076ba34aSJunchao Zhang PetscInt *srowlens; /* send buf of row lens */ 532076ba34aSJunchao Zhang MatRowMapKokkosViewHost rrowlens_h("rrowoffset_h",nrows+1); /* recv buf of row lens. +1 to make CSR later. Memory might be passed to other views */ 533076ba34aSJunchao Zhang PetscInt *rrowlens = rrowlens_h.data(); 534076ba34aSJunchao Zhang 5359566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(Am,&srowlens,niranks+nranks,&reqs)); 536076ba34aSJunchao Zhang for (i=0; i<Am; i++) srowlens[i] = Ai[i+1] - Ai[i]; 537076ba34aSJunchao Zhang rrowlens[0] = 0; 538076ba34aSJunchao Zhang rrowlens++; /* shift the pointer to make the following expression more readable */ 5399566063dSJacob Faibussowitsch for (i=0; i<niranks; i++)PetscCallMPI(MPI_Irecv(&rrowlens[ioffset[i]],ioffset[i+1]-ioffset[i],MPIU_INT,iranks[i],tag,comm,&reqs[i])); 5409566063dSJacob Faibussowitsch for (i=0; i<nranks; i++) PetscCallMPI(MPI_Isend(&srowlens[roffset[i]],roffset[i+1]-roffset[i],MPIU_INT,ranks[i],tag,comm,&reqs[niranks+i])); 5419566063dSJacob Faibussowitsch PetscCallMPI(MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE)); 542076ba34aSJunchao Zhang 543076ba34aSJunchao Zhang /* Owner builds Ci on host by histogramming rrowlens[] */ 544076ba34aSJunchao Zhang MatRowMapKokkosViewHost Ci_h("i",Cm+1); 545076ba34aSJunchao Zhang Kokkos::deep_copy(Ci_h,0); /* Zero Ci */ 546076ba34aSJunchao Zhang MatRowMapType *Ci_ptr = Ci_h.data(); 547076ba34aSJunchao Zhang 548076ba34aSJunchao Zhang for (i=0; i<nrows; i++) { 549076ba34aSJunchao Zhang r = rows[i]; /* local row id of i-th received row */ 550076ba34aSJunchao Zhang #if defined(PETSC_USE_DEBUG) 551aed4548fSBarry Smith PetscCheck(r >= 0 && r < Cm,PETSC_COMM_SELF,PETSC_ERR_PLIB,"local row id (%" PetscInt_FMT ") is out of range [0,%" PetscInt_FMT ")",r,Cm); 552076ba34aSJunchao Zhang #endif 553076ba34aSJunchao Zhang Ci_ptr[r+1] += rrowlens[i]; /* add to length of row r in C */ 554076ba34aSJunchao Zhang } 555076ba34aSJunchao Zhang for (i=0; i<Cm; i++) Ci_ptr[i+1] += Ci_ptr[i]; /* to CSR format */ 556076ba34aSJunchao Zhang Cnnz = Ci_ptr[Cm]; 557076ba34aSJunchao Zhang 558076ba34aSJunchao Zhang /* For each received row, compute src & dst offsets in memory copying (from recv bufs abuf, jbuf to Ca, Cj) */ 559076ba34aSJunchao Zhang MatRowMapKokkosViewHost dstrowoffset_h("dstrowoffset_h",nrows); 560076ba34aSJunchao Zhang PetscInt *dstrowoffset_hptr = dstrowoffset_h.data(); 561076ba34aSJunchao Zhang PetscInt *currowlens; /* Current row lens. They are temp accumulators for row lens in C, to help build dstrowoffset */ 562076ba34aSJunchao Zhang 5639566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(Cm,&currowlens)); /* Init with zero, to be added to */ 564076ba34aSJunchao Zhang for (i=0; i<nrows; i++) { /* for each row I receive */ 565076ba34aSJunchao Zhang r = rows[i]; /* row id in C */ 566076ba34aSJunchao Zhang dstrowoffset_hptr[i] = Ci_ptr[r] + currowlens[r]; /* dst offset of the new place for each recv'ed row in Ca/Cj */ 567076ba34aSJunchao Zhang currowlens[r] += rrowlens[i]; /* accumulate to length of row r in C */ 568076ba34aSJunchao Zhang } 5699566063dSJacob Faibussowitsch PetscCall(PetscFree(currowlens)); 570076ba34aSJunchao Zhang 571076ba34aSJunchao Zhang rrowlens--; 572076ba34aSJunchao Zhang for (i=0; i<nrows; i++) rrowlens[i+1] += rrowlens[i]; /* Change rrowlens[] to CSR format */ 573076ba34aSJunchao Zhang dstrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),dstrowoffset_h); 574076ba34aSJunchao Zhang srcrowoffset = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),rrowlens_h); /* src offset of each recv'ed row in abuf/jbuf */ 575076ba34aSJunchao Zhang 576076ba34aSJunchao Zhang /* Build the reduceSF, which performs buffer to buffer send/recv */ 577076ba34aSJunchao Zhang PetscInt *sdisp,*rdisp; /* buffer to send offsets of roots, and buffer to recv them */ 5789566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(niranks,&sdisp,nranks,&rdisp)); 579076ba34aSJunchao Zhang for (i=0; i<niranks; i++) sdisp[i] = rrowlens[ioffset[i]]; 5809566063dSJacob Faibussowitsch for (i=0; i<nranks; i++) PetscCallMPI(MPI_Irecv(&rdisp[i],1,MPIU_INT,ranks[i],tag,comm,&reqs[i])); 5819566063dSJacob Faibussowitsch for (i=0; i<niranks; i++) PetscCallMPI(MPI_Isend(&sdisp[i],1,MPIU_INT,iranks[i],tag,comm,&reqs[nranks+i])); 5829566063dSJacob Faibussowitsch PetscCallMPI(MPI_Waitall(niranks+nranks,reqs,MPI_STATUSES_IGNORE)); 583076ba34aSJunchao Zhang 584076ba34aSJunchao Zhang /* Nonzeros in abuf/jbuf are roots and those in A are leaves */ 585076ba34aSJunchao Zhang PetscInt nroots2 = Cnnz,nleaves2 = Annz; 586076ba34aSJunchao Zhang PetscSFNode *iremote; 5879566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nleaves2,&iremote)); /* no free, since memory will be given to reduceSF */ 588076ba34aSJunchao Zhang for (i=0; i<nranks; i++) { 589076ba34aSJunchao Zhang PetscInt rootbase = rdisp[i]; /* root offset at this root rank */ 590076ba34aSJunchao Zhang PetscInt leafbase = Ai[roffset[i]]; /* leaf base */ 591076ba34aSJunchao Zhang PetscInt nz = Ai[roffset[i+1]] - leafbase; /* I will send nz nonzeros to this root rank */ 592076ba34aSJunchao Zhang for (PetscInt k=0; k<nz; k++) { 593076ba34aSJunchao Zhang iremote[leafbase+k].rank = ranks[i]; 594076ba34aSJunchao Zhang iremote[leafbase+k].index = rootbase + k; 595076ba34aSJunchao Zhang } 596076ba34aSJunchao Zhang } 5979566063dSJacob Faibussowitsch PetscCall(PetscSFCreate(comm,&reduceSF)); 5989566063dSJacob Faibussowitsch PetscCall(PetscSFSetGraph(reduceSF,nroots2,nleaves2,NULL,PETSC_OWN_POINTER,iremote,PETSC_OWN_POINTER)); 5999566063dSJacob Faibussowitsch PetscCall(PetscFree2(sdisp,rdisp)); 600076ba34aSJunchao Zhang 601076ba34aSJunchao Zhang /* Reduce Aa, Ajg to abuf and jbuf */ 602076ba34aSJunchao Zhang 603076ba34aSJunchao Zhang /* If A uses local col ids, convert them to global ones before sending */ 604076ba34aSJunchao Zhang MatColIdxKokkosView Ajg; 605076ba34aSJunchao Zhang if (local) { 606076ba34aSJunchao Zhang Ajg = MatColIdxKokkosView("j",Annz); 607076ba34aSJunchao Zhang const MatColIdxKokkosView& Aj = akok->j_dual.view_device(); 608076ba34aSJunchao Zhang Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscInt i) {Ajg(i) = l2g(Aj(i));}); 609076ba34aSJunchao Zhang } else { 610076ba34aSJunchao Zhang Ajg = akok->j_dual.view_device(); /* no data copy, just take a reference */ 611076ba34aSJunchao Zhang } 612076ba34aSJunchao Zhang 613076ba34aSJunchao Zhang MatColIdxKokkosView jbuf("jbuf",Cnnz); 614076ba34aSJunchao Zhang abuf = MatScalarKokkosView("abuf",Cnnz); 6159566063dSJacob Faibussowitsch PetscCallMPI(PetscSFReduceBegin(reduceSF,MPIU_INT, Ajg.data(), jbuf.data(),MPI_REPLACE)); 6169566063dSJacob Faibussowitsch PetscCallMPI(PetscSFReduceEnd (reduceSF,MPIU_INT, Ajg.data(), jbuf.data(),MPI_REPLACE)); 6179566063dSJacob Faibussowitsch PetscCallMPI(PetscSFReduceBegin(reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE)); 6189566063dSJacob Faibussowitsch PetscCallMPI(PetscSFReduceEnd (reduceSF,MPIU_SCALAR,akok->a_device_data(),abuf.data(),MPI_REPLACE)); 619076ba34aSJunchao Zhang 620076ba34aSJunchao Zhang /* Copy data from abuf, jbuf to Ca, Cj */ 621076ba34aSJunchao Zhang MatRowMapKokkosView Ci = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),Ci_h); /* Ci is an alias of Ci_h if no device */ 622076ba34aSJunchao Zhang MatColIdxKokkosView Cj("j",Cnnz); 623076ba34aSJunchao Zhang MatScalarKokkosView Ca("a",Cnnz); 624076ba34aSJunchao Zhang 625076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(nrows, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 626076ba34aSJunchao Zhang PetscInt i = t.league_rank(); 627076ba34aSJunchao Zhang PetscInt src = srcrowoffset(i), dst = dstrowoffset(i); 628076ba34aSJunchao Zhang PetscInt len = srcrowoffset(i+1) - srcrowoffset(i); 629076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t,len), [&](PetscInt k) { 630076ba34aSJunchao Zhang Ca(dst+k) = abuf(src+k); 631076ba34aSJunchao Zhang Cj(dst+k) = jbuf(src+k); 632076ba34aSJunchao Zhang }); 633076ba34aSJunchao Zhang }); 634076ba34aSJunchao Zhang 635076ba34aSJunchao Zhang /* Build C with Ca, Ci, Cj */ 636076ba34aSJunchao Zhang C = KokkosCsrMatrix("csrmat",Cm,N,Cnnz,Ca,Ci,Cj); 6379566063dSJacob Faibussowitsch PetscCall(PetscFree2(srowlens,reqs)); 63898921bdaSJacob Faibussowitsch } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unsupported MatReuse enum %d",reuse); 639076ba34aSJunchao Zhang PetscFunctionReturn(0); 640076ba34aSJunchao Zhang } 641076ba34aSJunchao Zhang 642076ba34aSJunchao Zhang /* MatSetMPIAIJKokkosWithGlobalCSRMatrix - Set the diag and offdiag parts of a MATMPIAIJKOKKOS matrix by splitting a KokkosCsrMatrix 643076ba34aSJunchao Zhang 644076ba34aSJunchao Zhang Input Parameters: 645076ba34aSJunchao Zhang + C - the MATMPIAIJKOKKOS matrix, of size m,n,M,N 646076ba34aSJunchao Zhang . reuse - indicate whether the matrix has called this function before 647076ba34aSJunchao Zhang . csrmat - the KokkosCsrMatrix, of size m,N 648076ba34aSJunchao Zhang - Cdstart - when reuse == MAT_REUSE_MATRIX, it is an input parameter. For each row in csrmat, it stores the start of the first 649076ba34aSJunchao Zhang entry of the diag block of C in csrmat's j array. E.g, if row i has col ids = {0, 3, 4, 5, 7, 9} and the first diag 650076ba34aSJunchao Zhang entry is 5, then Cdstart[i] = 3. 651076ba34aSJunchao Zhang 652076ba34aSJunchao Zhang Output Parameters: 653076ba34aSJunchao Zhang + C - the updated MATMPIAIJKOKKOS matrix 654076ba34aSJunchao Zhang - Cdstart - when reuse == MAT_INITIAL_MATRIX, it is an output parameter 655076ba34aSJunchao Zhang 656076ba34aSJunchao Zhang Notes: 657076ba34aSJunchao Zhang Between calls with MAT_INITIAL_MATRIX or MAT_REUSE_MATRIX, csrmat must have the same nonzero pattern 658076ba34aSJunchao Zhang */ 659076ba34aSJunchao Zhang static PetscErrorCode MatSetMPIAIJKokkosWithGlobalCSRMatrix(Mat C,MatReuse reuse,const KokkosCsrMatrix& csrmat,MatRowMapKokkosView& Cdstart) 660076ba34aSJunchao Zhang { 661076ba34aSJunchao Zhang const MatScalarKokkosView& Ca = csrmat.values; 662076ba34aSJunchao Zhang const ConstMatRowMapKokkosView& Ci = csrmat.graph.row_map; 663076ba34aSJunchao Zhang PetscInt m,n,N; 664076ba34aSJunchao Zhang 665076ba34aSJunchao Zhang PetscFunctionBegin; 6669566063dSJacob Faibussowitsch PetscCall(MatGetLocalSize(C,&m,&n)); 6679566063dSJacob Faibussowitsch PetscCall(MatGetSize(C,NULL,&N)); 668076ba34aSJunchao Zhang 669076ba34aSJunchao Zhang if (reuse == MAT_REUSE_MATRIX) { 670076ba34aSJunchao Zhang Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ*>(C->data); 671076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(mpiaij->A->spptr); 672076ba34aSJunchao Zhang Mat_SeqAIJKokkos *bkok = static_cast<Mat_SeqAIJKokkos*>(mpiaij->B->spptr); 673076ba34aSJunchao Zhang const MatScalarKokkosView& Cda = akok->a_dual.view_device(),Coa = bkok->a_dual.view_device(); 674076ba34aSJunchao Zhang const MatRowMapKokkosView& Cdi = akok->i_dual.view_device(),Coi = bkok->i_dual.view_device(); 675076ba34aSJunchao Zhang 676076ba34aSJunchao Zhang /* Fill 'a' of Cd and Co on device */ 677076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 678076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 679076ba34aSJunchao Zhang PetscInt clen = Ci(i+1) - Ci(i); /* len of row i of C */ 680076ba34aSJunchao Zhang PetscInt cdlen = Cdi(i+1) - Cdi(i); /* len of row i of Cd */ 681076ba34aSJunchao Zhang PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */ 682076ba34aSJunchao Zhang PetscInt cdend = cdstart + cdlen; 683076ba34aSJunchao Zhang /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */ 684076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) { 685076ba34aSJunchao Zhang if (k < cdstart) { /* k in [0, cdstart) */ 686076ba34aSJunchao Zhang Coa(Coi(i)+k) = Ca(Ci(i)+k); 687076ba34aSJunchao Zhang } else if (k < cdend) { /* k in [cdstart, cdend) */ 688076ba34aSJunchao Zhang Cda(Cdi(i)+(k-cdstart)) = Ca(Ci(i)+k); 689076ba34aSJunchao Zhang } else { /* k in [cdend, clen) */ 690076ba34aSJunchao Zhang Coa(Coi(i)+k-cdlen) = Ca(Ci(i)+k); 691076ba34aSJunchao Zhang } 692076ba34aSJunchao Zhang }); 693076ba34aSJunchao Zhang }); 694076ba34aSJunchao Zhang 695076ba34aSJunchao Zhang akok->a_dual.modify_device(); 696076ba34aSJunchao Zhang bkok->a_dual.modify_device(); 697076ba34aSJunchao Zhang } else if (reuse == MAT_INITIAL_MATRIX) { 698076ba34aSJunchao Zhang Mat Cd,Co; 699076ba34aSJunchao Zhang const MatColIdxKokkosView& Cj = csrmat.graph.entries; 700076ba34aSJunchao Zhang MatRowMapKokkosDualView Cdi_dual("i",m+1),Coi_dual("i",m+1); 701076ba34aSJunchao Zhang MatRowMapKokkosView Cdi = Cdi_dual.view_device(),Coi = Coi_dual.view_device(); 702076ba34aSJunchao Zhang PetscInt cstart,cend; 703076ba34aSJunchao Zhang 704076ba34aSJunchao Zhang /* Note that each row of C is sorted by col ids. We want to find out how to cut each row into three blocks: 705076ba34aSJunchao Zhang left to the diag block, diag block, right to the diag block. The diag block have col ids in [cstart,cend). 706076ba34aSJunchao Zhang Suppose a row of C has len nonzeros, indexed by [0, len). We want to know two indices: cdstart and cdend, 707076ba34aSJunchao Zhang such that the three blocks are [0,cdstart), [cdstart,cdend), [cdend,len). The following code equivalentaly 708076ba34aSJunchao Zhang stores values of cdstart and cdend-cstart (aka Cdi[]) instead. 709076ba34aSJunchao Zhang */ 710076ba34aSJunchao Zhang Cdstart = MatRowMapKokkosView("Cdstart",m); 7119566063dSJacob Faibussowitsch PetscCall(PetscLayoutGetRange(C->cmap,&cstart,&cend)); /* Not MatGetOwnershipRangeColumn() since C has not been preallocated yet */ 712076ba34aSJunchao Zhang 713076ba34aSJunchao Zhang /* I could use RangePolicy and one thread per row. But since each thread essentially does binary search, threads in a 714076ba34aSJunchao Zhang CUDA warp would completely diverge. So I use TeamPolicy with a team size 1. 715076ba34aSJunchao Zhang */ 716076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 717076ba34aSJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 718076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 719076ba34aSJunchao Zhang PetscInt j,first,count,step; 720076ba34aSJunchao Zhang 721076ba34aSJunchao Zhang if (i == 0) { /* Set the first entry of the i arrays to zero on device, to be used in CSR */ 722076ba34aSJunchao Zhang Cdi(0) = 0; 723076ba34aSJunchao Zhang Coi(0) = 0; 724076ba34aSJunchao Zhang } 725076ba34aSJunchao Zhang 726076ba34aSJunchao Zhang /* Do std::lower_bound(Ci(i),Ci(i+1),cstart) on Cj[]. We use j as the iterator. lower_bound() returns 727076ba34aSJunchao Zhang in 'first' the first iterator with a value >= cstart, or last iterator if no such element is found. 728076ba34aSJunchao Zhang */ 729076ba34aSJunchao Zhang count = Ci(i+1)-Ci(i); 730076ba34aSJunchao Zhang first = Ci(i); 731076ba34aSJunchao Zhang while (count > 0) { 732076ba34aSJunchao Zhang j = first; 733076ba34aSJunchao Zhang step = count / 2; 734076ba34aSJunchao Zhang j += step; 735076ba34aSJunchao Zhang if (Cj(j) < cstart) { 736076ba34aSJunchao Zhang first = ++j; 737076ba34aSJunchao Zhang count -= step + 1; 738076ba34aSJunchao Zhang } else count = step; 739076ba34aSJunchao Zhang } 740076ba34aSJunchao Zhang Cdstart(i) = first - Ci(i); /* 'first' is the while-loop's output */ 741076ba34aSJunchao Zhang 742076ba34aSJunchao Zhang /* Do std::lower_bound(first,Ci(i+1),cend) on Cj[] */ 743076ba34aSJunchao Zhang count = Ci(i+1) - first; 744076ba34aSJunchao Zhang while (count > 0) { 745076ba34aSJunchao Zhang j = first; 746076ba34aSJunchao Zhang step = count / 2; 747076ba34aSJunchao Zhang j += step; 748076ba34aSJunchao Zhang if (Cj(j) < cend) { 749076ba34aSJunchao Zhang first = ++j; 750076ba34aSJunchao Zhang count -= step + 1; 751076ba34aSJunchao Zhang } else count = step; 752076ba34aSJunchao Zhang } 753076ba34aSJunchao Zhang Cdi(i+1) = first - (Ci(i)+Cdstart(i)); /* 'first' is the while-loop's output */ 754076ba34aSJunchao Zhang Coi(i+1) = (Ci(i+1)-Ci(i)) - Cdi(i+1); /* Co's row len = C's row len - Cd's row len */ 755076ba34aSJunchao Zhang }); 756076ba34aSJunchao Zhang }); 757076ba34aSJunchao Zhang 758076ba34aSJunchao Zhang /* Convert row lens in Cdi[], Coi[] to CSR format using inclusive scan, e.g., changing [0,1,2,3] into [0,1,3,6] */ 759076ba34aSJunchao Zhang Kokkos::parallel_scan(m+1,KOKKOS_LAMBDA(const PetscInt i,PetscInt& update,const bool final) { 760076ba34aSJunchao Zhang update += Cdi(i); 761076ba34aSJunchao Zhang if (final) Cdi(i) = update; 762076ba34aSJunchao Zhang }); 763076ba34aSJunchao Zhang Kokkos::parallel_scan(m+1,KOKKOS_LAMBDA(const PetscInt i,PetscInt& update,const bool final) { 764076ba34aSJunchao Zhang update += Coi(i); 765076ba34aSJunchao Zhang if (final) Coi(i) = update; 766076ba34aSJunchao Zhang }); 767076ba34aSJunchao Zhang 768076ba34aSJunchao Zhang /* Get Cdi, Coi on host (it is not a waste, since we do need them on host in 769076ba34aSJunchao Zhang MatCreateSeqAIJKokkosWithCSRMatrix() below), then get nnz of Cd and Co. 770076ba34aSJunchao Zhang */ 771076ba34aSJunchao Zhang Cdi_dual.modify_device(); 772076ba34aSJunchao Zhang Coi_dual.modify_device(); 773076ba34aSJunchao Zhang Cdi_dual.sync_host(); 774076ba34aSJunchao Zhang Coi_dual.sync_host(); 775076ba34aSJunchao Zhang PetscInt Cd_nnz = Cdi_dual.view_host().data()[m]; 776076ba34aSJunchao Zhang PetscInt Co_nnz = Coi_dual.view_host().data()[m]; 777076ba34aSJunchao Zhang 778076ba34aSJunchao Zhang /* With nnz, allocate a, j for Cd and Co */ 779076ba34aSJunchao Zhang MatColIdxKokkosDualView Cdj_dual("j",Cd_nnz),Coj_dual("j",Co_nnz); 780076ba34aSJunchao Zhang MatScalarKokkosDualView Cda_dual("a",Cd_nnz),Coa_dual("a",Co_nnz); 781076ba34aSJunchao Zhang 782076ba34aSJunchao Zhang /* Fill a, j of Cd and Co on device */ 783076ba34aSJunchao Zhang MatColIdxKokkosView Cdj = Cdj_dual.view_device(),Coj = Coj_dual.view_device(); 784076ba34aSJunchao Zhang MatScalarKokkosView Cda = Cda_dual.view_device(),Coa = Coa_dual.view_device(); 785076ba34aSJunchao Zhang 786076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 787076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 788076ba34aSJunchao Zhang PetscInt clen = Ci(i+1) - Ci(i); /* len of row i of C */ 789076ba34aSJunchao Zhang PetscInt cdlen = Cdi(i+1) - Cdi(i); /* len of row i of Cd */ 790076ba34aSJunchao Zhang PetscInt cdstart = Cdstart(i); /* [start, end) of row i of Cd in C */ 791076ba34aSJunchao Zhang PetscInt cdend = cdstart + cdlen; 792076ba34aSJunchao Zhang /* [0, clen) is cut into three blocks: [0, cdstart), [cdstart, cdend), [cdend, clen) */ 793076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, clen), [&](PetscInt k) { 794076ba34aSJunchao Zhang if (k < cdstart) { /* k in [0, cdstart) */ 795076ba34aSJunchao Zhang Coa(Coi(i)+k) = Ca(Ci(i)+k); 796076ba34aSJunchao Zhang Coj(Coi(i)+k) = Cj(Ci(i)+k); 797076ba34aSJunchao Zhang } else if (k < cdend) { /* k in [cdstart, cdend) */ 798076ba34aSJunchao Zhang Cda(Cdi(i)+(k-cdstart)) = Ca(Ci(i)+k); 799076ba34aSJunchao Zhang Cdj(Cdi(i)+(k-cdstart)) = Cj(Ci(i)+k) - cstart; /* Use local col ids in Cdj */ 800076ba34aSJunchao Zhang } else { /* k in [cdend, clen) */ 801076ba34aSJunchao Zhang Coa(Coi(i)+k-cdlen) = Ca(Ci(i)+k); 802076ba34aSJunchao Zhang Coj(Coi(i)+k-cdlen) = Cj(Ci(i)+k); 803076ba34aSJunchao Zhang } 804076ba34aSJunchao Zhang }); 805076ba34aSJunchao Zhang }); 806076ba34aSJunchao Zhang 807076ba34aSJunchao Zhang Cdj_dual.modify_device(); 808076ba34aSJunchao Zhang Cda_dual.modify_device(); 809076ba34aSJunchao Zhang Coj_dual.modify_device(); 810076ba34aSJunchao Zhang Coa_dual.modify_device(); 811076ba34aSJunchao Zhang /* With a, i, j for Cd and Co, finally build Cd, Co and then C. Their offloadmask will be set in each's MatAssemblyEnd */ 812076ba34aSJunchao Zhang auto cdkok = new Mat_SeqAIJKokkos(m,n,Cd_nnz,Cdi_dual,Cdj_dual,Cda_dual); 813076ba34aSJunchao Zhang auto cokok = new Mat_SeqAIJKokkos(m,N,Co_nnz,Coi_dual,Coj_dual,Coa_dual); 8149566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,cdkok,&Cd)); 8159566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,cokok,&Co)); 8169566063dSJacob Faibussowitsch PetscCall(MatSetMPIAIJKokkosWithSplitSeqAIJKokkosMatrices(C,Cd,Co)); /* Coj will be converted to local ids within */ 817076ba34aSJunchao Zhang } 818076ba34aSJunchao Zhang PetscFunctionReturn(0); 819076ba34aSJunchao Zhang } 820076ba34aSJunchao Zhang 821076ba34aSJunchao Zhang /* MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos - Compact a SEQAIJKOKKS matrix's global col ids. 822076ba34aSJunchao Zhang 823076ba34aSJunchao Zhang It is similar to MatSeqAIJCompactOutExtraColumns_SeqAIJ, but it applies to SEQAIJKOKKOS and returns the l2g map in Kokkos view. 824076ba34aSJunchao Zhang 825076ba34aSJunchao Zhang Input Parameters: 826076ba34aSJunchao Zhang + C - the MATMPIAIJKOKKOS matrix, of size m,n,M,N 827076ba34aSJunchao Zhang . reuse - indicate whether the matrix has called this function before 828076ba34aSJunchao Zhang . csrmat - the KokkosCsrMatrix, of size m,N 829076ba34aSJunchao Zhang - Cdoffset - when reuse == MAT_REUSE_MATRIX, it is an input parameter. For each row in csrmat, it stores the offset of the first 830076ba34aSJunchao Zhang entry of the diag block of C in csrmat's j array. 831076ba34aSJunchao Zhang 832076ba34aSJunchao Zhang Output Parameters: 833076ba34aSJunchao Zhang + C - the updated MATMPIAIJKOKKOS matrix 834076ba34aSJunchao Zhang - Cdoffset - when reuse == MAT_INITIAL_MATRIX, it is an output parameter 835076ba34aSJunchao Zhang 836076ba34aSJunchao Zhang Notes: the input matrix's col ids and col size will be changed. 837076ba34aSJunchao Zhang */ 838076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(Mat C,MatColIdxKokkosView& l2g) 839076ba34aSJunchao Zhang { 840076ba34aSJunchao Zhang Mat_SeqAIJKokkos *ckok; 841076ba34aSJunchao Zhang ISLocalToGlobalMapping l2gmap; 842076ba34aSJunchao Zhang const PetscInt *garray; 843076ba34aSJunchao Zhang PetscInt sz; 844076ba34aSJunchao Zhang 845076ba34aSJunchao Zhang PetscFunctionBegin; 846076ba34aSJunchao Zhang /* Compact P_other's global col ids and col size. We do it since we guess with local ids KK might be more memory scalable */ 8479566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJ(C,&l2gmap)); 848076ba34aSJunchao Zhang ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 849076ba34aSJunchao Zhang ckok->j_dual.modify_host(); /* P_other's j is modified on host; we need to sync it on device */ 850076ba34aSJunchao Zhang ckok->j_dual.sync_device(); 851076ba34aSJunchao Zhang ckok->SetColSize(C->cmap->n); /* Update col size of the csrmat in spptr */ 852076ba34aSJunchao Zhang 853076ba34aSJunchao Zhang /* Build l2g -- the local to global mapping of C's cols */ 8549566063dSJacob Faibussowitsch PetscCall(ISLocalToGlobalMappingGetIndices(l2gmap,&garray)); 8559566063dSJacob Faibussowitsch PetscCall(ISLocalToGlobalMappingGetSize(l2gmap,&sz)); 85608401ef6SPierre Jolivet PetscCheck(C->cmap->n == sz,PETSC_COMM_SELF,PETSC_ERR_PLIB,"matrix column size(%" PetscInt_FMT ") != l2g mapping size(%" PetscInt_FMT ")", C->cmap->n,sz); 857076ba34aSJunchao Zhang 858076ba34aSJunchao Zhang ConstMatColIdxKokkosViewHost tmp(garray,sz); 859076ba34aSJunchao Zhang l2g = MatColIdxKokkosView("l2g",sz); 860076ba34aSJunchao Zhang Kokkos::deep_copy(l2g,tmp); 861076ba34aSJunchao Zhang 8629566063dSJacob Faibussowitsch PetscCall(ISLocalToGlobalMappingRestoreIndices(l2gmap,&garray)); 8639566063dSJacob Faibussowitsch PetscCall(ISLocalToGlobalMappingDestroy(&l2gmap)); 864076ba34aSJunchao Zhang PetscFunctionReturn(0); 865076ba34aSJunchao Zhang } 866076ba34aSJunchao Zhang 867076ba34aSJunchao Zhang /* MatProductSymbolic_MPIAIJKokkos_AB - AB flavor of MatProductSymbolic_MPIAIJKokkos 868076ba34aSJunchao Zhang 869076ba34aSJunchao Zhang Input Parameters: 870076ba34aSJunchao Zhang + product - Mat_Product which carried out the computation. Passed in to access info about this mat product. 871076ba34aSJunchao Zhang . A - an MPIAIJKOKKOS matrix 872076ba34aSJunchao Zhang . B - an MPIAIJKOKKOS matrix 873076ba34aSJunchao Zhang - mm - a struct used to stash intermediate data when computing AB. Persist from symbolic to numeric operations. 874076ba34aSJunchao Zhang 875076ba34aSJunchao Zhang Notes: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids. 876076ba34aSJunchao Zhang */ 877076ba34aSJunchao Zhang static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AB(Mat_Product *product,Mat A,Mat B,MatMatStruct_AB *mm) 878076ba34aSJunchao Zhang { 879076ba34aSJunchao Zhang Mat_MPIAIJ *a = static_cast<Mat_MPIAIJ*>(A->data); 880076ba34aSJunchao Zhang Mat Ad = a->A,Ao = a->B; /* diag and offdiag of A */ 881076ba34aSJunchao Zhang IS glob = NULL; 882076ba34aSJunchao Zhang const PetscInt *garray; 883076ba34aSJunchao Zhang PetscInt N = B->cmap->N,sz; 884076ba34aSJunchao Zhang ConstMatColIdxKokkosView l2g1; /* two temp maps mapping local col ids to global ones */ 885076ba34aSJunchao Zhang MatColIdxKokkosView l2g2; 886076ba34aSJunchao Zhang Mat C1,C2; /* intermediate matrices */ 887076ba34aSJunchao Zhang 888076ba34aSJunchao Zhang PetscFunctionBegin; 889076ba34aSJunchao Zhang /* C1 = Ad * B_local. B_local is a matrix got by merging Bd and Bo, and uses local col ids */ 8909566063dSJacob Faibussowitsch PetscCall(MatMPIAIJGetLocalMatMerge(B,MAT_INITIAL_MATRIX,&glob,&mm->B_local)); 8919566063dSJacob Faibussowitsch PetscCall(MatProductCreate(Ad,mm->B_local,NULL,&C1)); 8929566063dSJacob Faibussowitsch PetscCall(MatProductSetType(C1,MATPRODUCT_AB)); 8939566063dSJacob Faibussowitsch PetscCall(MatProductSetFill(C1,product->fill)); 894076ba34aSJunchao Zhang C1->product->api_user = product->api_user; 8959566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions(C1)); 8965f80ce2aSJacob Faibussowitsch PetscCheck(C1->ops->productsymbolic,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C1->product->type]); 8979566063dSJacob Faibussowitsch PetscCall((*C1->ops->productsymbolic)(C1)); 898076ba34aSJunchao Zhang 8999566063dSJacob Faibussowitsch PetscCall(ISGetIndices(glob,&garray)); 9009566063dSJacob Faibussowitsch PetscCall(ISGetSize(glob,&sz)); 901076ba34aSJunchao Zhang const auto& tmp = ConstMatColIdxKokkosViewHost(garray,sz); /* wrap garray as a view */ 902076ba34aSJunchao Zhang l2g1 = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),tmp); /* maybe just an alias to tmp, so we restore garray at the very end */ 9039566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1,N,l2g1,mm->C1_global)); 904076ba34aSJunchao Zhang 905076ba34aSJunchao Zhang /* C2 = Ao * B_other. B_other is a matrix consisting of needed rows of B gathered from other procs */ 9069566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosBcast(mm->B_local,MAT_INITIAL_MATRIX,N,l2g1,a->Mvctx,mm->sf,mm->abuf,mm->rows,mm->rowoffset,mm->B_other)); 907076ba34aSJunchao Zhang 908076ba34aSJunchao Zhang /* Compact B_other to use local ids as we guess KK spgemm is more memroy scalable with that; We could skip the compaction to simplify code */ 9099566063dSJacob Faibussowitsch PetscCall(MatSeqAIJCompactOutExtraColumns_SeqAIJKokkos(mm->B_other,l2g2)); 9109566063dSJacob Faibussowitsch PetscCall(MatProductCreate(Ao,mm->B_other,NULL,&C2)); 9119566063dSJacob Faibussowitsch PetscCall(MatProductSetType(C2,MATPRODUCT_AB)); 9129566063dSJacob Faibussowitsch PetscCall(MatProductSetFill(C2,product->fill)); 913076ba34aSJunchao Zhang C2->product->api_user = product->api_user; 9149566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions(C2)); 9155f80ce2aSJacob Faibussowitsch PetscCheck(C2->ops->productsymbolic,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C2->product->type]); 9169566063dSJacob Faibussowitsch PetscCall((*C2->ops->productsymbolic)(C2)); 9179566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C2,N,l2g2,mm->C2_global)); 918076ba34aSJunchao Zhang 919076ba34aSJunchao Zhang /* C = C1 + C2. We actually use their global col ids versions in adding */ 920076ba34aSJunchao Zhang mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B_local, B_other are not */ 921076ba34aSJunchao Zhang KokkosSparse::spadd_symbolic(&mm->kh,mm->C1_global,mm->C2_global,mm->C_global); 922076ba34aSJunchao Zhang /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */ 923076ba34aSJunchao Zhang KokkosSparse::spadd_numeric(&mm->kh,(MatScalarType)1.0,mm->C1_global,(MatScalarType)1.0,mm->C2_global,mm->C_global); 924076ba34aSJunchao Zhang 925076ba34aSJunchao Zhang mm->C1 = C1; 926076ba34aSJunchao Zhang mm->C2 = C2; 9279566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(glob,&garray)); 9289566063dSJacob Faibussowitsch PetscCall(ISDestroy(&glob)); 929076ba34aSJunchao Zhang PetscFunctionReturn(0); 930076ba34aSJunchao Zhang } 931076ba34aSJunchao Zhang 932076ba34aSJunchao Zhang /* MatProductSymbolic_MPIAIJKokkos_AtB - A^tB flavor of MatProductSymbolic_MPIAIJKokkos 933076ba34aSJunchao Zhang 934076ba34aSJunchao Zhang Input Parameters: 935076ba34aSJunchao Zhang + product - Mat_Product which carried out the computation. Passed in to access info about this mat product. 936076ba34aSJunchao Zhang . A - an MPIAIJKOKKOS matrix 937076ba34aSJunchao Zhang . B - a SEQAIJKOKKOS matrix. It works as if A^t is multiplied by a parallel matrix made up of Bs on each rank. 938076ba34aSJunchao Zhang . localB - Does B use local col ids? If false, then B is already in global col ids. 939076ba34aSJunchao Zhang . N - col size of the "parallel B matrix". It implies B's global col ids are in range of [0,N) and N is the same across the communicator. 940076ba34aSJunchao Zhang . l2g - If localB, then l2g maps B's local col ids to global ones. 941076ba34aSJunchao Zhang - mm - a struct used to stash intermediate data in AtB 942076ba34aSJunchao Zhang 943076ba34aSJunchao Zhang Notes: The local part of the result C is stored as mm->C_global, which is of type KokkosCsrMatrix and uses global col ids. 944076ba34aSJunchao Zhang */ 945076ba34aSJunchao Zhang static PetscErrorCode MatProductSymbolic_MPIAIJKokkos_AtB(Mat_Product *product,Mat A,Mat B,PetscBool localB,PetscInt N,const ConstMatColIdxKokkosView& l2g,MatMatStruct_AtB *mm) 946076ba34aSJunchao Zhang { 947076ba34aSJunchao Zhang Mat_MPIAIJ *a = static_cast<Mat_MPIAIJ*>(A->data); 948076ba34aSJunchao Zhang Mat Ad = a->A,Ao = a->B; /* diag and offdiag of A */ 949076ba34aSJunchao Zhang Mat C1,C2; /* intermediate matrices */ 950076ba34aSJunchao Zhang 951076ba34aSJunchao Zhang PetscFunctionBegin; 952076ba34aSJunchao Zhang /* C1 = Ad^t * B */ 9539566063dSJacob Faibussowitsch PetscCall(MatProductCreate(Ad,B,NULL,&C1)); 9549566063dSJacob Faibussowitsch PetscCall(MatProductSetType(C1,MATPRODUCT_AtB)); 9559566063dSJacob Faibussowitsch PetscCall(MatProductSetFill(C1,product->fill)); 956076ba34aSJunchao Zhang C1->product->api_user = product->api_user; 9579566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions(C1)); 9585f80ce2aSJacob Faibussowitsch PetscCheck(C1->ops->productsymbolic,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C1->product->type]); 9599566063dSJacob Faibussowitsch PetscCall((*C1->ops->productsymbolic)(C1)); 960076ba34aSJunchao Zhang 9619566063dSJacob Faibussowitsch if (localB) PetscCall(MatSeqAIJKokkosGetCSRMatrixWithGlobalColumnIds(C1,N,l2g,mm->C1_global)); 962076ba34aSJunchao Zhang else mm->C1_global = static_cast<Mat_SeqAIJKokkos*>(C1->spptr)->csrmat; /* the csrmat already uses global col ids */ 963076ba34aSJunchao Zhang 964076ba34aSJunchao Zhang /* C2 = Ao^t * B */ 9659566063dSJacob Faibussowitsch PetscCall(MatProductCreate(Ao,B,NULL,&C2)); 9669566063dSJacob Faibussowitsch PetscCall(MatProductSetType(C2,MATPRODUCT_AtB)); 9679566063dSJacob Faibussowitsch PetscCall(MatProductSetFill(C2,product->fill)); 968076ba34aSJunchao Zhang C2->product->api_user = product->api_user; 9699566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions(C2)); 9705f80ce2aSJacob Faibussowitsch PetscCheck(C2->ops->productsymbolic,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[C2->product->type]); 9719566063dSJacob Faibussowitsch PetscCall((*C2->ops->productsymbolic)(C2)); 972076ba34aSJunchao Zhang 9739566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosReduce(C2,MAT_INITIAL_MATRIX,localB,N,l2g,a->Mvctx,mm->sf,mm->abuf,mm->srcrowoffset,mm->dstrowoffset,mm->C2_global)); 974076ba34aSJunchao Zhang 975076ba34aSJunchao Zhang mm->kh.create_spadd_handle(false); /* Input C1, C2 are NOT sorted, since B may be not */ 976076ba34aSJunchao Zhang KokkosSparse::spadd_symbolic(&mm->kh,mm->C1_global,mm->C2_global,mm->C_global); 977076ba34aSJunchao Zhang /* Have to do numeric since spadd_symbolic does not really populate column indices of the result matrix */ 978076ba34aSJunchao Zhang KokkosSparse::spadd_numeric(&mm->kh,(MatScalarType)1.0,mm->C1_global,(MatScalarType)1.0,mm->C2_global,mm->C_global); 979076ba34aSJunchao Zhang mm->C1 = C1; 980076ba34aSJunchao Zhang mm->C2 = C2; 981076ba34aSJunchao Zhang PetscFunctionReturn(0); 982076ba34aSJunchao Zhang } 983076ba34aSJunchao Zhang 984076ba34aSJunchao Zhang PetscErrorCode MatProductNumeric_MPIAIJKokkos(Mat C) 985076ba34aSJunchao Zhang { 986076ba34aSJunchao Zhang Mat_Product *product = C->product; 987076ba34aSJunchao Zhang MatProductType ptype; 988076ba34aSJunchao Zhang MatProductData_MPIAIJKokkos *mmdata; 989076ba34aSJunchao Zhang MatMatStruct *mm = NULL; 990076ba34aSJunchao Zhang MatMatStruct_AB *ab; 991076ba34aSJunchao Zhang MatMatStruct_AtB *atb; 992076ba34aSJunchao Zhang Mat A,B,Ad,Ao,Bd,Bo; 993076ba34aSJunchao Zhang const MatScalarType one = 1.0; /* Not use literal 1.0 directly, to avoid wrong template instantiation in KokkosSparse::spadd_numeric */ 994076ba34aSJunchao Zhang 995076ba34aSJunchao Zhang PetscFunctionBegin; 996076ba34aSJunchao Zhang MatCheckProduct(C,1); 997076ba34aSJunchao Zhang mmdata = static_cast<MatProductData_MPIAIJKokkos*>(product->data); 998076ba34aSJunchao Zhang ptype = product->type; 999076ba34aSJunchao Zhang A = product->A; 1000076ba34aSJunchao Zhang B = product->B; 1001076ba34aSJunchao Zhang Ad = static_cast<Mat_MPIAIJ*>(A->data)->A; 1002076ba34aSJunchao Zhang Ao = static_cast<Mat_MPIAIJ*>(A->data)->B; 1003076ba34aSJunchao Zhang Bd = static_cast<Mat_MPIAIJ*>(B->data)->A; 1004076ba34aSJunchao Zhang Bo = static_cast<Mat_MPIAIJ*>(B->data)->B; 1005076ba34aSJunchao Zhang 1006076ba34aSJunchao Zhang if (mmdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 1007076ba34aSJunchao Zhang mmdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 1008076ba34aSJunchao Zhang ab = mmdata->mmAB; 1009076ba34aSJunchao Zhang atb = mmdata->mmAtB; 1010076ba34aSJunchao Zhang if (ab) { 1011076ba34aSJunchao Zhang static_cast<MatProductData_SeqAIJKokkos*>(ab->C1->product->data)->reusesym = PETSC_FALSE; 1012076ba34aSJunchao Zhang static_cast<MatProductData_SeqAIJKokkos*>(ab->C2->product->data)->reusesym = PETSC_FALSE; 1013076ba34aSJunchao Zhang } 1014076ba34aSJunchao Zhang if (atb) { 1015076ba34aSJunchao Zhang static_cast<MatProductData_SeqAIJKokkos*>(atb->C1->product->data)->reusesym = PETSC_FALSE; 1016076ba34aSJunchao Zhang static_cast<MatProductData_SeqAIJKokkos*>(atb->C2->product->data)->reusesym = PETSC_FALSE; 1017076ba34aSJunchao Zhang } 1018076ba34aSJunchao Zhang PetscFunctionReturn(0); 1019076ba34aSJunchao Zhang } 1020076ba34aSJunchao Zhang 1021076ba34aSJunchao Zhang if (ptype == MATPRODUCT_AB) { 1022076ba34aSJunchao Zhang ab = mmdata->mmAB; 1023076ba34aSJunchao Zhang /* C1 = Ad * B_local */ 102408401ef6SPierre Jolivet PetscCheck(ab->C1->ops->productnumeric && ab->C2->ops->productnumeric,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing numeric op for MATPRODUCT_AB"); 10259566063dSJacob Faibussowitsch PetscCall(MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&ab->B_local)); 10265f80ce2aSJacob Faibussowitsch PetscCheck(ab->C1->product->B == ab->B_local,PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AB, internal mat product matrix C1->B has unexpectedly changed"); 10279566063dSJacob Faibussowitsch if (ab->C1->product->A != Ad) PetscCall(MatProductReplaceMats(Ad,NULL,NULL,ab->C1)); 10289566063dSJacob Faibussowitsch PetscCall((*ab->C1->ops->productnumeric)(ab->C1)); 10299566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosBcast(ab->B_local,MAT_REUSE_MATRIX,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,ab->sf, 10305f80ce2aSJacob Faibussowitsch ab->abuf,ab->rows,ab->rowoffset,ab->B_other)); 1031076ba34aSJunchao Zhang /* C2 = Ao * B_other */ 103208401ef6SPierre Jolivet PetscCheck(ab->C2->product->B == ab->B_other,PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AB, internal mat product matrix C2->B has unexpectedly changed"); 10339566063dSJacob Faibussowitsch if (ab->C1->product->A != Ao) PetscCall(MatProductReplaceMats(Ao,NULL,NULL,ab->C2)); 10349566063dSJacob Faibussowitsch PetscCall((*ab->C2->ops->productnumeric)(ab->C2)); 1035076ba34aSJunchao Zhang /* C = C1_global + C2_global */ 1036076ba34aSJunchao Zhang KokkosSparse::spadd_numeric(&ab->kh,one,ab->C1_global,one,ab->C2_global,ab->C_global); 1037076ba34aSJunchao Zhang mm = static_cast<MatMatStruct*>(ab); 1038076ba34aSJunchao Zhang } else if (ptype == MATPRODUCT_AtB) { 1039076ba34aSJunchao Zhang atb = mmdata->mmAtB; 104008401ef6SPierre Jolivet PetscCheck(atb->C1->ops->productnumeric && atb->C2->ops->productnumeric,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing numeric op for MATPRODUCT_AtB"); 1041076ba34aSJunchao Zhang /* C1 = Ad^t * B_local */ 10429566063dSJacob Faibussowitsch PetscCall(MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&atb->B_local)); 104308401ef6SPierre Jolivet PetscCheck(atb->C1->product->B == atb->B_local,PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AtB, internal mat product matrix C1->B has unexpectedly changed"); 10449566063dSJacob Faibussowitsch if (atb->C1->product->A != Ad) PetscCall(MatProductReplaceMats(Ad,NULL,NULL,atb->C1)); 10459566063dSJacob Faibussowitsch PetscCall((*atb->C1->ops->productnumeric)(atb->C1)); 1046076ba34aSJunchao Zhang 1047076ba34aSJunchao Zhang /* C2 = Ao^t * B_local */ 104808401ef6SPierre Jolivet PetscCheck(atb->C2->product->B == atb->B_local,PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_AtB, internal mat product matrix C2->B has unexpectedly changed"); 10499566063dSJacob Faibussowitsch if (atb->C2->product->A != Ao) PetscCall(MatProductReplaceMats(Ao,NULL,NULL,atb->C2)); 10509566063dSJacob Faibussowitsch PetscCall((*atb->C2->ops->productnumeric)(atb->C2)); 1051076ba34aSJunchao Zhang /* Form C2_global */ 10529566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosReduce(atb->C2,MAT_REUSE_MATRIX,PETSC_TRUE,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,atb->sf, 10535f80ce2aSJacob Faibussowitsch atb->abuf,atb->srcrowoffset,atb->dstrowoffset,atb->C2_global)); 1054076ba34aSJunchao Zhang /* C = C1_global + C2_global */ 1055076ba34aSJunchao Zhang KokkosSparse::spadd_numeric(&atb->kh,one,atb->C1_global,one,atb->C2_global,atb->C_global); 1056076ba34aSJunchao Zhang mm = static_cast<MatMatStruct*>(atb); 1057076ba34aSJunchao Zhang } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */ 1058076ba34aSJunchao Zhang ab = mmdata->mmAB; 10599566063dSJacob Faibussowitsch PetscCall(MatMPIAIJGetLocalMatMerge(B,MAT_REUSE_MATRIX,NULL/*glob*/,&ab->B_local)); 1060076ba34aSJunchao Zhang 1061076ba34aSJunchao Zhang /* ab->C1 = Ad * B_local */ 106208401ef6SPierre Jolivet PetscCheck(ab->C1->product->B == ab->B_local,PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_PtAP, internal mat product matrix ab->C1->B has unexpectedly changed"); 10639566063dSJacob Faibussowitsch if (ab->C1->product->A != Ad) PetscCall(MatProductReplaceMats(Ad,NULL,NULL,ab->C1)); 10649566063dSJacob Faibussowitsch PetscCall((*ab->C1->ops->productnumeric)(ab->C1)); 10659566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosBcast(ab->B_local,MAT_REUSE_MATRIX,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,ab->sf, 10665f80ce2aSJacob Faibussowitsch ab->abuf,ab->rows,ab->rowoffset,ab->B_other)); 1067076ba34aSJunchao Zhang /* ab->C2 = Ao * B_other */ 10689566063dSJacob Faibussowitsch if (ab->C2->product->A != Ao) PetscCall(MatProductReplaceMats(Ao,NULL,NULL,ab->C2)); 10699566063dSJacob Faibussowitsch PetscCall((*ab->C2->ops->productnumeric)(ab->C2)); /* C2 = Ao * B_other */ 1070076ba34aSJunchao Zhang KokkosSparse::spadd_numeric(&ab->kh,one,ab->C1_global,one,ab->C2_global,ab->C_global); 1071076ba34aSJunchao Zhang 1072076ba34aSJunchao Zhang /* atb->C1 = Bd^t * ab->C_petsc */ 1073076ba34aSJunchao Zhang atb = mmdata->mmAtB; 107408401ef6SPierre Jolivet PetscCheck(atb->C1->product->B == ab->C_petsc,PETSC_COMM_SELF,PETSC_ERR_PLIB,"In MATPRODUCT_PtAP, internal mat product matrix atb->C1->B has unexpectedly changed"); 10759566063dSJacob Faibussowitsch if (atb->C1->product->A != Bd) PetscCall(MatProductReplaceMats(Bd,NULL,NULL,atb->C1)); 10769566063dSJacob Faibussowitsch PetscCall((*atb->C1->ops->productnumeric)(atb->C1)); 1077076ba34aSJunchao Zhang /* atb->C2 = Bo^t * ab->C_petsc */ 10789566063dSJacob Faibussowitsch if (atb->C2->product->A != Bo) PetscCall(MatProductReplaceMats(Bo,NULL,NULL,atb->C2)); 10799566063dSJacob Faibussowitsch PetscCall((*atb->C2->ops->productnumeric)(atb->C2)); 10809566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosReduce(atb->C2,MAT_REUSE_MATRIX,PETSC_FALSE,0/*N*/,MatColIdxKokkosView()/*l2g*/,NULL/*ownerSF*/,atb->sf, 10815f80ce2aSJacob Faibussowitsch atb->abuf,atb->srcrowoffset,atb->dstrowoffset,atb->C2_global)); 1082076ba34aSJunchao Zhang KokkosSparse::spadd_numeric(&atb->kh,one,atb->C1_global,one,atb->C2_global,atb->C_global); 1083076ba34aSJunchao Zhang mm = static_cast<MatMatStruct*>(atb); 1084076ba34aSJunchao Zhang } 1085076ba34aSJunchao Zhang /* Split C_global to form C */ 10869566063dSJacob Faibussowitsch PetscCall(MatSetMPIAIJKokkosWithGlobalCSRMatrix(C,MAT_REUSE_MATRIX,mm->C_global,mm->Cdstart)); 1087076ba34aSJunchao Zhang PetscFunctionReturn(0); 1088076ba34aSJunchao Zhang } 1089076ba34aSJunchao Zhang 1090076ba34aSJunchao Zhang PetscErrorCode MatProductSymbolic_MPIAIJKokkos(Mat C) 1091076ba34aSJunchao Zhang { 1092076ba34aSJunchao Zhang Mat A,B; 1093076ba34aSJunchao Zhang Mat_Product *product = C->product; 1094076ba34aSJunchao Zhang MatProductType ptype; 1095076ba34aSJunchao Zhang MatProductData_MPIAIJKokkos *mmdata; 1096076ba34aSJunchao Zhang MatMatStruct *mm = NULL; 1097076ba34aSJunchao Zhang IS glob = NULL; 1098076ba34aSJunchao Zhang const PetscInt *garray; 1099076ba34aSJunchao Zhang PetscInt m,n,M,N,sz; 1100076ba34aSJunchao Zhang ConstMatColIdxKokkosView l2g; /* map local col ids to global ones */ 1101076ba34aSJunchao Zhang 1102076ba34aSJunchao Zhang PetscFunctionBegin; 1103076ba34aSJunchao Zhang MatCheckProduct(C,1); 110428b400f6SJacob Faibussowitsch PetscCheck(!product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty"); 1105076ba34aSJunchao Zhang ptype = product->type; 1106076ba34aSJunchao Zhang A = product->A; 1107076ba34aSJunchao Zhang B = product->B; 1108076ba34aSJunchao Zhang 1109076ba34aSJunchao Zhang switch (ptype) { 1110076ba34aSJunchao Zhang case MATPRODUCT_AB: m = A->rmap->n; n = B->cmap->n; M = A->rmap->N; N = B->cmap->N; break; 1111076ba34aSJunchao Zhang case MATPRODUCT_AtB: m = A->cmap->n; n = B->cmap->n; M = A->cmap->N; N = B->cmap->N; break; 1112076ba34aSJunchao Zhang case MATPRODUCT_PtAP: m = B->cmap->n; n = B->cmap->n; M = B->cmap->N; N = B->cmap->N; break; /* BtAB */ 111398921bdaSJacob Faibussowitsch default: SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]); 1114076ba34aSJunchao Zhang } 1115076ba34aSJunchao Zhang 11169566063dSJacob Faibussowitsch PetscCall(MatSetSizes(C,m,n,M,N)); 11179566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(C->rmap)); 11189566063dSJacob Faibussowitsch PetscCall(PetscLayoutSetUp(C->cmap)); 11199566063dSJacob Faibussowitsch PetscCall(MatSetType(C,((PetscObject)A)->type_name)); 1120076ba34aSJunchao Zhang 1121076ba34aSJunchao Zhang mmdata = new MatProductData_MPIAIJKokkos(); 1122076ba34aSJunchao Zhang mmdata->reusesym = product->api_user; 1123076ba34aSJunchao Zhang 1124076ba34aSJunchao Zhang if (ptype == MATPRODUCT_AB) { 1125076ba34aSJunchao Zhang mmdata->mmAB = new MatMatStruct_AB(); 11269566063dSJacob Faibussowitsch PetscCall(MatProductSymbolic_MPIAIJKokkos_AB(product,A,B,mmdata->mmAB)); 1127076ba34aSJunchao Zhang mm = static_cast<MatMatStruct*>(mmdata->mmAB); 1128076ba34aSJunchao Zhang } else if (ptype == MATPRODUCT_AtB) { 1129076ba34aSJunchao Zhang mmdata->mmAtB = new MatMatStruct_AtB(); 1130076ba34aSJunchao Zhang auto atb = mmdata->mmAtB; 11319566063dSJacob Faibussowitsch PetscCall(MatMPIAIJGetLocalMatMerge(B,MAT_INITIAL_MATRIX,&glob,&atb->B_local)); 11329566063dSJacob Faibussowitsch PetscCall(ISGetIndices(glob,&garray)); 11339566063dSJacob Faibussowitsch PetscCall(ISGetSize(glob,&sz)); 1134076ba34aSJunchao Zhang l2g = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatColIdxKokkosViewHost(garray,sz)); 11359566063dSJacob Faibussowitsch PetscCall(MatProductSymbolic_MPIAIJKokkos_AtB(product,A,atb->B_local,PETSC_TRUE,N,l2g,atb)); 11369566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(glob,&garray)); 11379566063dSJacob Faibussowitsch PetscCall(ISDestroy(&glob)); 1138076ba34aSJunchao Zhang mm = static_cast<MatMatStruct*>(atb); 1139076ba34aSJunchao Zhang } else if (ptype == MATPRODUCT_PtAP) { /* BtAB */ 1140076ba34aSJunchao Zhang mmdata->mmAB = new MatMatStruct_AB(); /* tmp=A*B */ 1141076ba34aSJunchao Zhang mmdata->mmAtB = new MatMatStruct_AtB(); /* C=B^t*tmp */ 1142076ba34aSJunchao Zhang auto ab = mmdata->mmAB; 1143076ba34aSJunchao Zhang auto atb = mmdata->mmAtB; 11449566063dSJacob Faibussowitsch PetscCall(MatProductSymbolic_MPIAIJKokkos_AB(product,A,B,ab)); 1145076ba34aSJunchao Zhang auto tmp = new Mat_SeqAIJKokkos(ab->C_global); /* Memory will be owned by ab->C_petsc */ 11469566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,tmp,&ab->C_petsc)); 11479566063dSJacob Faibussowitsch PetscCall(MatProductSymbolic_MPIAIJKokkos_AtB(product,B,ab->C_petsc,PETSC_FALSE,N,l2g/*not used*/,atb)); 1148076ba34aSJunchao Zhang mm = static_cast<MatMatStruct*>(atb); 1149076ba34aSJunchao Zhang } 1150076ba34aSJunchao Zhang /* Split the C_global into petsc A, B format */ 11519566063dSJacob Faibussowitsch PetscCall(MatSetMPIAIJKokkosWithGlobalCSRMatrix(C,MAT_INITIAL_MATRIX,mm->C_global,mm->Cdstart)); 1152076ba34aSJunchao Zhang C->product->data = mmdata; 1153076ba34aSJunchao Zhang C->product->destroy = MatProductDataDestroy_MPIAIJKokkos; 1154076ba34aSJunchao Zhang C->ops->productnumeric = MatProductNumeric_MPIAIJKokkos; 1155076ba34aSJunchao Zhang PetscFunctionReturn(0); 1156076ba34aSJunchao Zhang } 1157076ba34aSJunchao Zhang 1158076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatProductSetFromOptions_MPIAIJKokkos(Mat mat) 1159076ba34aSJunchao Zhang { 1160076ba34aSJunchao Zhang Mat_Product *product = mat->product; 1161076ba34aSJunchao Zhang PetscBool match = PETSC_FALSE; 1162076ba34aSJunchao Zhang PetscBool usecpu = PETSC_FALSE; 1163076ba34aSJunchao Zhang 1164076ba34aSJunchao Zhang PetscFunctionBegin; 1165076ba34aSJunchao Zhang MatCheckProduct(mat,1); 1166076ba34aSJunchao Zhang if (!product->A->boundtocpu && !product->B->boundtocpu) { 11679566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)product->B,((PetscObject)product->A)->type_name,&match)); 1168076ba34aSJunchao Zhang } 1169076ba34aSJunchao Zhang if (match) { /* we can always fallback to the CPU if requested */ 1170076ba34aSJunchao Zhang switch (product->type) { 1171076ba34aSJunchao Zhang case MATPRODUCT_AB: 1172076ba34aSJunchao Zhang if (product->api_user) { 1173d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatMatMult","Mat"); 11749566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matmatmult_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL)); 1175d0609cedSBarry Smith PetscOptionsEnd(); 1176076ba34aSJunchao Zhang } else { 1177d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AB","Mat"); 11789566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatMatMult",usecpu,&usecpu,NULL)); 1179d0609cedSBarry Smith PetscOptionsEnd(); 1180076ba34aSJunchao Zhang } 1181076ba34aSJunchao Zhang break; 1182076ba34aSJunchao Zhang case MATPRODUCT_AtB: 1183076ba34aSJunchao Zhang if (product->api_user) { 1184d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatTransposeMatMult","Mat"); 11859566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mattransposematmult_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL)); 1186d0609cedSBarry Smith PetscOptionsEnd(); 1187076ba34aSJunchao Zhang } else { 1188d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_AtB","Mat"); 11899566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatTransposeMatMult",usecpu,&usecpu,NULL)); 1190d0609cedSBarry Smith PetscOptionsEnd(); 1191076ba34aSJunchao Zhang } 1192076ba34aSJunchao Zhang break; 1193076ba34aSJunchao Zhang case MATPRODUCT_PtAP: 1194076ba34aSJunchao Zhang if (product->api_user) { 1195d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatPtAP","Mat"); 11969566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-matptap_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL)); 1197d0609cedSBarry Smith PetscOptionsEnd(); 1198076ba34aSJunchao Zhang } else { 1199d0609cedSBarry Smith PetscOptionsBegin(PetscObjectComm((PetscObject)mat),((PetscObject)mat)->prefix,"MatProduct_PtAP","Mat"); 12009566063dSJacob Faibussowitsch PetscCall(PetscOptionsBool("-mat_product_algorithm_backend_cpu","Use CPU code","MatPtAP",usecpu,&usecpu,NULL)); 1201d0609cedSBarry Smith PetscOptionsEnd(); 1202076ba34aSJunchao Zhang } 1203076ba34aSJunchao Zhang break; 1204076ba34aSJunchao Zhang default: 1205076ba34aSJunchao Zhang break; 1206076ba34aSJunchao Zhang } 1207076ba34aSJunchao Zhang match = (PetscBool)!usecpu; 1208076ba34aSJunchao Zhang } 1209076ba34aSJunchao Zhang if (match) { 1210076ba34aSJunchao Zhang switch (product->type) { 1211076ba34aSJunchao Zhang case MATPRODUCT_AB: 1212076ba34aSJunchao Zhang case MATPRODUCT_AtB: 1213076ba34aSJunchao Zhang case MATPRODUCT_PtAP: 1214076ba34aSJunchao Zhang mat->ops->productsymbolic = MatProductSymbolic_MPIAIJKokkos; 1215076ba34aSJunchao Zhang break; 1216076ba34aSJunchao Zhang default: 1217076ba34aSJunchao Zhang break; 1218076ba34aSJunchao Zhang } 1219076ba34aSJunchao Zhang } 1220076ba34aSJunchao Zhang /* fallback to MPIAIJ ops */ 1221076ba34aSJunchao Zhang if (!mat->ops->productsymbolic) { 12229566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions_MPIAIJ(mat)); 1223076ba34aSJunchao Zhang } 1224076ba34aSJunchao Zhang PetscFunctionReturn(0); 1225076ba34aSJunchao Zhang } 1226076ba34aSJunchao Zhang 1227*e8729f6fSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_MPIAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 122842550becSJunchao Zhang { 1229394ed5ebSJunchao Zhang Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 1230cbc6b225SStefano Zampini Mat_MPIAIJKokkos *mpikok; 123142550becSJunchao Zhang 123242550becSJunchao Zhang PetscFunctionBegin; 12339566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_MPIAIJ(mat,coo_n,coo_i,coo_j)); 1234cbc6b225SStefano Zampini mat->preallocated = PETSC_TRUE; 12359566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(mat,MAT_FINAL_ASSEMBLY)); 12369566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(mat,MAT_FINAL_ASSEMBLY)); 12379566063dSJacob Faibussowitsch PetscCall(MatZeroEntries(mat)); 1238cbc6b225SStefano Zampini mpikok = static_cast<Mat_MPIAIJKokkos*>(mpiaij->spptr); 1239cbc6b225SStefano Zampini delete mpikok; 1240394ed5ebSJunchao Zhang mpiaij->spptr = new Mat_MPIAIJKokkos(mpiaij); 124142550becSJunchao Zhang PetscFunctionReturn(0); 124242550becSJunchao Zhang } 124342550becSJunchao Zhang 124442550becSJunchao Zhang static PetscErrorCode MatSetValuesCOO_MPIAIJKokkos(Mat mat,const PetscScalar v[],InsertMode imode) 124542550becSJunchao Zhang { 1246394ed5ebSJunchao Zhang Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); 124742550becSJunchao Zhang Mat_MPIAIJKokkos *mpikok = static_cast<Mat_MPIAIJKokkos*>(mpiaij->spptr); 124842550becSJunchao Zhang Mat A = mpiaij->A,B = mpiaij->B; 1249158ec288SJunchao Zhang PetscCount Annz = mpiaij->Annz,Annz2 = mpiaij->Annz2,Bnnz = mpiaij->Bnnz,Bnnz2 = mpiaij->Bnnz2; 125042550becSJunchao Zhang MatScalarKokkosView Aa,Ba; 1251394ed5ebSJunchao Zhang MatScalarKokkosView v1; 125242550becSJunchao Zhang MatScalarKokkosView& vsend = mpikok->sendbuf_d; 125342550becSJunchao Zhang const MatScalarKokkosView& v2 = mpikok->recvbuf_d; 1254158ec288SJunchao Zhang const PetscCountKokkosView& Ajmap1 = mpikok->Ajmap1_d,Ajmap2 = mpikok->Ajmap2_d,Aimap2 = mpikok->Aimap2_d; 1255158ec288SJunchao Zhang const PetscCountKokkosView& Bjmap1 = mpikok->Bjmap1_d,Bjmap2 = mpikok->Bjmap2_d,Bimap2 = mpikok->Bimap2_d; 1256394ed5ebSJunchao Zhang const PetscCountKokkosView& Aperm1 = mpikok->Aperm1_d,Aperm2 = mpikok->Aperm2_d,Bperm1 = mpikok->Bperm1_d,Bperm2 = mpikok->Bperm2_d; 1257394ed5ebSJunchao Zhang const PetscCountKokkosView& Cperm1 = mpikok->Cperm1_d; 125842550becSJunchao Zhang PetscMemType memtype; 125942550becSJunchao Zhang 126042550becSJunchao Zhang PetscFunctionBegin; 12619566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(v,&memtype)); /* Return PETSC_MEMTYPE_HOST when v is NULL */ 126242550becSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device if any */ 1263394ed5ebSJunchao Zhang v1 = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),MatScalarKokkosViewHost((PetscScalar*)v,mpiaij->coo_n)); 126442550becSJunchao Zhang } else { 1265394ed5ebSJunchao Zhang v1 = MatScalarKokkosView((PetscScalar*)v,mpiaij->coo_n); /* Directly use v[]'s memory */ 126642550becSJunchao Zhang } 126742550becSJunchao Zhang 126842550becSJunchao Zhang if (imode == INSERT_VALUES) { 12699566063dSJacob Faibussowitsch PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */ 12709566063dSJacob Faibussowitsch PetscCall(MatSeqAIJGetKokkosViewWrite(B,&Ba)); 1271394ed5ebSJunchao Zhang } else { 12729566063dSJacob Faibussowitsch PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */ 12739566063dSJacob Faibussowitsch PetscCall(MatSeqAIJGetKokkosView(B,&Ba)); 127442550becSJunchao Zhang } 127542550becSJunchao Zhang 127642550becSJunchao Zhang /* Pack entries to be sent to remote */ 1277394ed5ebSJunchao Zhang Kokkos::parallel_for(vsend.extent(0),KOKKOS_LAMBDA(const PetscCount i) {vsend(i) = v1(Cperm1(i));}); 127842550becSJunchao Zhang 127942550becSJunchao Zhang /* Send remote entries to their owner and overlap the communication with local computation */ 12809566063dSJacob Faibussowitsch PetscCall(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_KOKKOS,vsend.data(),PETSC_MEMTYPE_KOKKOS,v2.data(),MPI_REPLACE)); 1281158ec288SJunchao Zhang /* Add local entries to A and B in one kernel */ 1282158ec288SJunchao Zhang Kokkos::parallel_for(Annz+Bnnz,KOKKOS_LAMBDA(PetscCount i) { 1283158ec288SJunchao Zhang PetscScalar sum = 0.0; 1284158ec288SJunchao Zhang if (i<Annz) { 1285158ec288SJunchao Zhang for (PetscCount k=Ajmap1(i); k<Ajmap1(i+1); k++) sum += v1(Aperm1(k)); 1286ac38520cSJunchao Zhang Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum; 1287158ec288SJunchao Zhang } else { 1288158ec288SJunchao Zhang i -= Annz; 1289158ec288SJunchao Zhang for (PetscCount k=Bjmap1(i); k<Bjmap1(i+1); k++) sum += v1(Bperm1(k)); 1290ac38520cSJunchao Zhang Ba(i) = (imode == INSERT_VALUES? 0.0 : Ba(i)) + sum; 1291158ec288SJunchao Zhang } 1292158ec288SJunchao Zhang }); 12939566063dSJacob Faibussowitsch PetscCall(PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend.data(),v2.data(),MPI_REPLACE)); 129442550becSJunchao Zhang 1295158ec288SJunchao Zhang /* Add received remote entries to A and B in one kernel */ 1296158ec288SJunchao Zhang Kokkos::parallel_for(Annz2+Bnnz2,KOKKOS_LAMBDA(PetscCount i) { 1297158ec288SJunchao Zhang if (i < Annz2) { 1298158ec288SJunchao Zhang for (PetscCount k=Ajmap2(i); k<Ajmap2(i+1); k++) Aa(Aimap2(i)) += v2(Aperm2(k)); 1299158ec288SJunchao Zhang } else { 1300158ec288SJunchao Zhang i -= Annz2; 1301158ec288SJunchao Zhang for (PetscCount k=Bjmap2(i); k<Bjmap2(i+1); k++) Ba(Bimap2(i)) += v2(Bperm2(k)); 1302158ec288SJunchao Zhang } 1303158ec288SJunchao Zhang }); 130442550becSJunchao Zhang 1305394ed5ebSJunchao Zhang if (imode == INSERT_VALUES) { 13069566063dSJacob Faibussowitsch PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa)); /* Increase A & B's state etc. */ 13079566063dSJacob Faibussowitsch PetscCall(MatSeqAIJRestoreKokkosViewWrite(B,&Ba)); 1308394ed5ebSJunchao Zhang } else { 13099566063dSJacob Faibussowitsch PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa)); 13109566063dSJacob Faibussowitsch PetscCall(MatSeqAIJRestoreKokkosView(B,&Ba)); 1311394ed5ebSJunchao Zhang } 131242550becSJunchao Zhang PetscFunctionReturn(0); 131342550becSJunchao Zhang } 131442550becSJunchao Zhang 1315076ba34aSJunchao Zhang PetscErrorCode MatDestroy_MPIAIJKokkos(Mat A) 1316076ba34aSJunchao Zhang { 131742550becSJunchao Zhang Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 1318076ba34aSJunchao Zhang 1319076ba34aSJunchao Zhang PetscFunctionBegin; 13209566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL)); 13219566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL)); 13229566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C", NULL)); 13239566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", NULL)); 132442550becSJunchao Zhang delete (Mat_MPIAIJKokkos*)mpiaij->spptr; 13259566063dSJacob Faibussowitsch PetscCall(MatDestroy_MPIAIJ(A)); 1326076ba34aSJunchao Zhang PetscFunctionReturn(0); 1327076ba34aSJunchao Zhang } 1328076ba34aSJunchao Zhang 13298c3ff71bSJunchao Zhang PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 13308c3ff71bSJunchao Zhang { 13318c3ff71bSJunchao Zhang Mat B; 1332076ba34aSJunchao Zhang Mat_MPIAIJ *a; 13338c3ff71bSJunchao Zhang 13348c3ff71bSJunchao Zhang PetscFunctionBegin; 13358c3ff71bSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { 13369566063dSJacob Faibussowitsch PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); 13378c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { 13389566063dSJacob Faibussowitsch PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); 13398c3ff71bSJunchao Zhang } 13408c3ff71bSJunchao Zhang B = *newmat; 13418c3ff71bSJunchao Zhang 13426f3d89d0SStefano Zampini B->boundtocpu = PETSC_FALSE; 13439566063dSJacob Faibussowitsch PetscCall(PetscFree(B->defaultvectype)); 13449566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECKOKKOS,&B->defaultvectype)); 13459566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)B,MATMPIAIJKOKKOS)); 13468c3ff71bSJunchao Zhang 1347076ba34aSJunchao Zhang a = static_cast<Mat_MPIAIJ*>(A->data); 13489566063dSJacob Faibussowitsch if (a->A) PetscCall(MatSetType(a->A,MATSEQAIJKOKKOS)); 13499566063dSJacob Faibussowitsch if (a->B) PetscCall(MatSetType(a->B,MATSEQAIJKOKKOS)); 13509566063dSJacob Faibussowitsch if (a->lvec) PetscCall(VecSetType(a->lvec,VECSEQKOKKOS)); 1351076ba34aSJunchao Zhang 13528c3ff71bSJunchao Zhang B->ops->assemblyend = MatAssemblyEnd_MPIAIJKokkos; 13538c3ff71bSJunchao Zhang B->ops->mult = MatMult_MPIAIJKokkos; 13548c3ff71bSJunchao Zhang B->ops->multadd = MatMultAdd_MPIAIJKokkos; 13558c3ff71bSJunchao Zhang B->ops->multtranspose = MatMultTranspose_MPIAIJKokkos; 1356076ba34aSJunchao Zhang B->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJKokkos; 1357076ba34aSJunchao Zhang B->ops->destroy = MatDestroy_MPIAIJKokkos; 13588c3ff71bSJunchao Zhang 13599566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJKokkos)); 13609566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJKokkos)); 13619566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJKokkos)); 13629566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)B,"MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJKokkos)); 13638c3ff71bSJunchao Zhang PetscFunctionReturn(0); 13648c3ff71bSJunchao Zhang } 13653f3ba80aSJunchao Zhang /*MC 13663f3ba80aSJunchao Zhang MATSMPIAIJKOKKOS - MATAIJKOKKOS = "(mpi)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 13678c3ff71bSJunchao Zhang 13683f3ba80aSJunchao Zhang A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 13693f3ba80aSJunchao Zhang 13703f3ba80aSJunchao Zhang Options Database Keys: 13713f3ba80aSJunchao Zhang . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions() 13723f3ba80aSJunchao Zhang 13733f3ba80aSJunchao Zhang Level: beginner 13743f3ba80aSJunchao Zhang 1375db781477SPatrick Sanan .seealso: `MatCreateAIJKokkos()`, `MATSEQAIJKOKKOS` 13763f3ba80aSJunchao Zhang M*/ 13778c3ff71bSJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJKokkos(Mat A) 13788c3ff71bSJunchao Zhang { 13798c3ff71bSJunchao Zhang PetscFunctionBegin; 13809566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 13819566063dSJacob Faibussowitsch PetscCall(MatCreate_MPIAIJ(A)); 13829566063dSJacob Faibussowitsch PetscCall(MatConvert_MPIAIJ_MPIAIJKokkos(A,MATMPIAIJKOKKOS,MAT_INPLACE_MATRIX,&A)); 13838c3ff71bSJunchao Zhang PetscFunctionReturn(0); 13848c3ff71bSJunchao Zhang } 13858c3ff71bSJunchao Zhang 13868c3ff71bSJunchao Zhang /*@C 13878c3ff71bSJunchao Zhang MatCreateAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 13888c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately pushed down 13898c3ff71bSJunchao Zhang to Kokkos for calculations. For good matrix 13908c3ff71bSJunchao Zhang assembly performance the user should preallocate the matrix storage by setting 13918c3ff71bSJunchao Zhang the parameter nz (or the array nnz). By setting these parameters accurately, 13928c3ff71bSJunchao Zhang performance during matrix assembly can be increased by more than a factor of 50. 13938c3ff71bSJunchao Zhang 13948c3ff71bSJunchao Zhang Collective 13958c3ff71bSJunchao Zhang 13968c3ff71bSJunchao Zhang Input Parameters: 13978c3ff71bSJunchao Zhang + comm - MPI communicator, set to PETSC_COMM_SELF 13988c3ff71bSJunchao Zhang . m - number of rows 13998c3ff71bSJunchao Zhang . n - number of columns 14008c3ff71bSJunchao Zhang . nz - number of nonzeros per row (same for all rows) 14018c3ff71bSJunchao Zhang - nnz - array containing the number of nonzeros in the various rows 14028c3ff71bSJunchao Zhang (possibly different for each row) or NULL 14038c3ff71bSJunchao Zhang 14048c3ff71bSJunchao Zhang Output Parameter: 14058c3ff71bSJunchao Zhang . A - the matrix 14068c3ff71bSJunchao Zhang 14078c3ff71bSJunchao Zhang It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 14088c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradigm instead of this routine directly. 14098c3ff71bSJunchao Zhang [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 14108c3ff71bSJunchao Zhang 14118c3ff71bSJunchao Zhang Notes: 14128c3ff71bSJunchao Zhang If nnz is given then nz is ignored 14138c3ff71bSJunchao Zhang 14148c3ff71bSJunchao Zhang The AIJ format (also called the Yale sparse matrix format or 14158c3ff71bSJunchao Zhang compressed row storage), is fully compatible with standard Fortran 77 14168c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 14178c3ff71bSJunchao Zhang either one (as in Fortran) or zero. See the users' manual for details. 14188c3ff71bSJunchao Zhang 14198c3ff71bSJunchao Zhang Specify the preallocated storage with either nz or nnz (not both). 14208c3ff71bSJunchao Zhang Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 14218c3ff71bSJunchao Zhang allocation. For large problems you MUST preallocate memory or you 14228c3ff71bSJunchao Zhang will get TERRIBLE performance, see the users' manual chapter on matrices. 14238c3ff71bSJunchao Zhang 14248c3ff71bSJunchao Zhang By default, this format uses inodes (identical nodes) when possible, to 14258c3ff71bSJunchao Zhang improve numerical efficiency of matrix-vector products and solves. We 14268c3ff71bSJunchao Zhang search for consecutive rows with the same nonzero structure, thereby 14278c3ff71bSJunchao Zhang reusing matrix information to achieve increased efficiency. 14288c3ff71bSJunchao Zhang 14298c3ff71bSJunchao Zhang Level: intermediate 14308c3ff71bSJunchao Zhang 1431db781477SPatrick Sanan .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()`, `MATMPIAIJKOKKOS`, `MATAIJKokkos` 14328c3ff71bSJunchao Zhang @*/ 14338c3ff71bSJunchao Zhang PetscErrorCode MatCreateAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 14348c3ff71bSJunchao Zhang { 14358c3ff71bSJunchao Zhang PetscMPIInt size; 14368c3ff71bSJunchao Zhang 14378c3ff71bSJunchao Zhang PetscFunctionBegin; 14389566063dSJacob Faibussowitsch PetscCall(MatCreate(comm,A)); 14399566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A,m,n,M,N)); 14409566063dSJacob Faibussowitsch PetscCallMPI(MPI_Comm_size(comm,&size)); 14418c3ff71bSJunchao Zhang if (size > 1) { 14429566063dSJacob Faibussowitsch PetscCall(MatSetType(*A,MATMPIAIJKOKKOS)); 14439566063dSJacob Faibussowitsch PetscCall(MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz)); 14448c3ff71bSJunchao Zhang } else { 14459566063dSJacob Faibussowitsch PetscCall(MatSetType(*A,MATSEQAIJKOKKOS)); 14469566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(*A,d_nz,d_nnz)); 14478c3ff71bSJunchao Zhang } 14488c3ff71bSJunchao Zhang PetscFunctionReturn(0); 14498c3ff71bSJunchao Zhang } 14508c3ff71bSJunchao Zhang 1451a587d139SMark // get GPU pointer to stripped down Mat. For both Seq and MPI Mat. 1452042217e8SBarry Smith PetscErrorCode MatKokkosGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B) 1453a587d139SMark { 1454a587d139SMark PetscMPIInt size,rank; 1455a587d139SMark MPI_Comm comm; 1456042217e8SBarry Smith PetscSplitCSRDataStructure d_mat=NULL; 1457a587d139SMark 1458a587d139SMark PetscFunctionBegin; 14599566063dSJacob Faibussowitsch PetscCall(PetscObjectGetComm((PetscObject)A,&comm)); 14609566063dSJacob Faibussowitsch PetscCallMPI(MPI_Comm_size(comm,&size)); 14619566063dSJacob Faibussowitsch PetscCallMPI(MPI_Comm_rank(comm,&rank)); 1462a587d139SMark if (size == 1) { 14639566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGetDeviceMat(A,&d_mat)); 14649566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); /* Since we are going to modify matrix values on device */ 1465a587d139SMark } else { 1466a587d139SMark Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 14679566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGetDeviceMat(aij->A,&d_mat)); 14689566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(aij->A)); 14699566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(aij->B)); 14702c71b3e2SJacob Faibussowitsch PetscCheck(A->nooffprocentries || aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)"); 1471a587d139SMark } 1472a587d139SMark // act like MatSetValues because not called on host 1473a587d139SMark if (A->assembled) { 1474a587d139SMark if (A->was_assembled) { 14759566063dSJacob Faibussowitsch PetscCall(PetscInfo(A,"Assemble more than once already\n")); 1476a587d139SMark } 1477a587d139SMark A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here? 1478a587d139SMark } else { 14799566063dSJacob Faibussowitsch PetscCall(PetscInfo(A,"Warning !assemble ??? assembled=%" PetscInt_FMT "\n",A->assembled)); 1480a587d139SMark } 1481a587d139SMark if (!d_mat) { 1482042217e8SBarry Smith struct _n_SplitCSRMat h_mat; /* host container */ 1483a587d139SMark Mat_SeqAIJKokkos *aijkokA; 1484a587d139SMark Mat_SeqAIJ *jaca; 1485a587d139SMark PetscInt n = A->rmap->n, nnz; 1486a587d139SMark Mat Amat; 1487042217e8SBarry Smith PetscInt *colmap; 1488042217e8SBarry Smith 1489042217e8SBarry Smith /* create and copy h_mat */ 149049b994a9SMark Adams h_mat.M = A->cmap->N; // use for debug build 14919566063dSJacob Faibussowitsch PetscCall(PetscInfo(A,"Create device matrix in Kokkos\n")); 1492a587d139SMark if (size == 1) { 1493a587d139SMark Amat = A; 1494a587d139SMark jaca = (Mat_SeqAIJ*)A->data; 1495a587d139SMark h_mat.rstart = 0; h_mat.rend = A->rmap->n; 1496a587d139SMark h_mat.cstart = 0; h_mat.cend = A->cmap->n; 1497a587d139SMark h_mat.offdiag.i = h_mat.offdiag.j = NULL; 1498a587d139SMark h_mat.offdiag.a = NULL; 1499a587d139SMark aijkokA = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1500a587d139SMark } else { 1501a587d139SMark Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 1502a587d139SMark Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data; 1503a587d139SMark PetscInt ii; 1504a587d139SMark Mat_SeqAIJKokkos *aijkokB; 1505042217e8SBarry Smith 1506a587d139SMark Amat = aij->A; 1507a587d139SMark aijkokA = static_cast<Mat_SeqAIJKokkos*>(aij->A->spptr); 1508a587d139SMark aijkokB = static_cast<Mat_SeqAIJKokkos*>(aij->B->spptr); 1509a587d139SMark jaca = (Mat_SeqAIJ*)aij->A->data; 151008401ef6SPierre Jolivet PetscCheck(!aij->B->cmap->n || aij->garray,comm,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 151108401ef6SPierre Jolivet PetscCheck(aij->B->rmap->n == aij->A->rmap->n,comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n"); 1512a587d139SMark aij->donotstash = PETSC_TRUE; 1513a587d139SMark aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE; 1514a5b23f4aSJose E. Roman jaca->nonew = jacb->nonew = PETSC_TRUE; // no more disassembly 15159566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(A->cmap->N,&colmap)); 15169566063dSJacob Faibussowitsch PetscCall(PetscLogObjectMemory((PetscObject)A,(A->cmap->N)*sizeof(PetscInt))); 1517042217e8SBarry Smith for (ii=0; ii<aij->B->cmap->n; ii++) colmap[aij->garray[ii]] = ii+1; 1518a587d139SMark // allocate B copy data 1519a587d139SMark h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend; 1520a587d139SMark h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend; 1521a587d139SMark nnz = jacb->i[n]; 1522a587d139SMark if (jacb->compressedrow.use) { 1523a587d139SMark const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_i_k (jacb->i,n+1); 1524300d22a6SJunchao Zhang aijkokB->i_uncompressed_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_i_k)); 1525300d22a6SJunchao Zhang Kokkos::deep_copy (aijkokB->i_uncompressed_d, h_i_k); 1526300d22a6SJunchao Zhang h_mat.offdiag.i = aijkokB->i_uncompressed_d.data(); 1527a587d139SMark } else { 152899551766SMark Adams h_mat.offdiag.i = aijkokB->i_device_data(); 1529a587d139SMark } 153099551766SMark Adams h_mat.offdiag.j = aijkokB->j_device_data(); 1531076ba34aSJunchao Zhang h_mat.offdiag.a = aijkokB->a_device_data(); 1532a587d139SMark { 1533042217e8SBarry Smith Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_colmap_k (colmap,A->cmap->N); 1534300d22a6SJunchao Zhang aijkokB->colmap_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_colmap_k)); 1535300d22a6SJunchao Zhang Kokkos::deep_copy (aijkokB->colmap_d, h_colmap_k); 1536300d22a6SJunchao Zhang h_mat.colmap = aijkokB->colmap_d.data(); 15379566063dSJacob Faibussowitsch PetscCall(PetscFree(colmap)); 1538a587d139SMark } 1539a587d139SMark h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries; 1540a587d139SMark h_mat.offdiag.n = n; 1541a587d139SMark } 1542a587d139SMark // allocate A copy data 1543a587d139SMark nnz = jaca->i[n]; 1544a587d139SMark h_mat.diag.n = n; 1545a587d139SMark h_mat.diag.ignorezeroentries = jaca->ignorezeroentries; 15469566063dSJacob Faibussowitsch PetscCallMPI(MPI_Comm_rank(comm,&h_mat.rank)); 1547aed4548fSBarry Smith PetscCheck(!jaca->compressedrow.use,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A does not suppport compressed row (todo)"); 154899551766SMark Adams h_mat.diag.i = aijkokA->i_device_data(); 154999551766SMark Adams h_mat.diag.j = aijkokA->j_device_data(); 1550076ba34aSJunchao Zhang h_mat.diag.a = aijkokA->a_device_data(); 1551a587d139SMark // copy pointers and metdata to device 15529566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSetDeviceMat(Amat,&h_mat)); 15539566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGetDeviceMat(Amat,&d_mat)); 15549566063dSJacob Faibussowitsch PetscCall(PetscInfo(A,"Create device Mat n=%" PetscInt_FMT " nnz=%" PetscInt_FMT "\n",h_mat.diag.n, nnz)); 1555a587d139SMark } 1556a587d139SMark *B = d_mat; // return it, set it in Mat, and set it up 1557a587d139SMark A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues 1558a587d139SMark PetscFunctionReturn(0); 1559a587d139SMark } 1560076ba34aSJunchao Zhang 1561076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetOffloadMask(Mat A, const char **mask) 1562076ba34aSJunchao Zhang { 1563076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1564076ba34aSJunchao Zhang 1565076ba34aSJunchao Zhang PetscFunctionBegin; 1566076ba34aSJunchao Zhang if (!aijkok) *mask = "AIJKOK_UNALLOCATED"; 1567076ba34aSJunchao Zhang else if (aijkok->a_dual.need_sync_host()) *mask = "PETSC_OFFLOAD_GPU"; 1568076ba34aSJunchao Zhang else if (aijkok->a_dual.need_sync_device()) *mask = "PETSC_OFFLOAD_CPU"; 1569076ba34aSJunchao Zhang else *mask = "PETSC_OFFLOAD_BOTH"; 1570076ba34aSJunchao Zhang PetscFunctionReturn(0); 1571076ba34aSJunchao Zhang } 1572076ba34aSJunchao Zhang 1573076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatAIJKokkosPrintOffloadMask(Mat A) 1574076ba34aSJunchao Zhang { 1575076ba34aSJunchao Zhang PetscMPIInt size; 1576076ba34aSJunchao Zhang Mat Ad,Ao; 1577076ba34aSJunchao Zhang const char *amask,*bmask; 1578076ba34aSJunchao Zhang 1579076ba34aSJunchao Zhang PetscFunctionBegin; 15809566063dSJacob Faibussowitsch PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 1581076ba34aSJunchao Zhang 1582076ba34aSJunchao Zhang if (size == 1) { 15839566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGetOffloadMask(A,&amask)); 15849566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF,"%s\n",amask)); 1585076ba34aSJunchao Zhang } else { 1586076ba34aSJunchao Zhang Ad = ((Mat_MPIAIJ*)A->data)->A; 1587076ba34aSJunchao Zhang Ao = ((Mat_MPIAIJ*)A->data)->B; 15889566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGetOffloadMask(Ad,&amask)); 15899566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGetOffloadMask(Ao,&bmask)); 15909566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF,"Diag : Off-diag = %s : %s\n",amask,bmask)); 1591076ba34aSJunchao Zhang } 1592076ba34aSJunchao Zhang PetscFunctionReturn(0); 1593076ba34aSJunchao Zhang } 1594