#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include <../src/mat/impls/aij/seq/aij.h> #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). In the latter case, it is important to set a_dual's sync state correctly. */ static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) { PetscErrorCode ierr; Mat_SeqAIJ *aijseq; Mat_SeqAIJKokkos *aijkok; PetscFunctionBegin; if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); aijseq = static_cast(A->data); aijkok = static_cast(A->spptr); /* If aijkok does not exist, we just copy i, j to device. If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. In both cases, we build a new aijkok structure. */ if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ delete aijkok; aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); A->spptr = aijkok; } if (aijkok && aijkok->device_mat_d.data()) { A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. 
MatSetValues checks this } PetscFunctionReturn(0); } /* Sync CSR data to device if not yet */ PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) { Mat_SeqAIJKokkos *aijkok = static_cast(A->spptr); PetscFunctionBegin; if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); if (aijkok->a_dual.need_sync_device()) { aijkok->a_dual.sync_device(); aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */ aijkok->hermitian_updated = PETSC_FALSE; } PetscFunctionReturn(0); } /* Mark the CSR data on device as modified */ static PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) { PetscErrorCode ierr; Mat_SeqAIJKokkos *aijkok = static_cast(A->spptr); PetscFunctionBegin; if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); aijkok->a_dual.clear_sync_state(); aijkok->a_dual.modify_device(); aijkok->transpose_updated = PETSC_FALSE; aijkok->hermitian_updated = PETSC_FALSE; ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); PetscFunctionReturn(0); } static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) { Mat_SeqAIJKokkos *aijkok = static_cast(A->spptr); PetscFunctionBegin; PetscCheckTypeName(A,MATSEQAIJKOKKOS); /* We do not expect one needs factors on host */ if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); aijkok->a_dual.sync_host(); PetscFunctionReturn(0); } static PetscErrorCode 
MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
{
  /* Return the host array of matrix values in *array, syncing device -> host first when the device structure exists */
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);

  PetscFunctionBegin;
  if (aijkok) {
    aijkok->a_dual.sync_host();
    *array = aijkok->a_dual.view_host().data();
  } else { /* Happens when calling MatSetValues on a newly created matrix */
    *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
  }
  PetscFunctionReturn(0);
}

/* Counterpart of MatSeqAIJGetArray_SeqAIJKokkos(): flag the host values as modified. The array argument is not touched. */
static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);

  PetscFunctionBegin;
  if (aijkok) aijkok->a_dual.modify_host();
  PetscFunctionReturn(0);
}

/* Read-only variant: return the host array of values in *array, syncing device -> host first when the device structure exists */
static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);

  PetscFunctionBegin;
  if (aijkok) {
    aijkok->a_dual.sync_host();
    *array = aijkok->a_dual.view_host().data();
  } else {
    *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
  }
  PetscFunctionReturn(0);
}

/* Counterpart of the read-only getter: only resets the caller's pointer; no sync flags change since nothing was written */
static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[])
{
  PetscFunctionBegin;
  *array = NULL;
  PetscFunctionReturn(0);
}

/* Write-only variant: return the host array of values in *array. No device -> host sync is performed here. */
static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);

  PetscFunctionBegin;
  if (aijkok) {
    *array = aijkok->a_dual.view_host().data();
  } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
    *array = static_cast<Mat_SeqAIJ*>(A->data)->a;
  }
  PetscFunctionReturn(0);
}

/* Counterpart of the write-only getter: reset a_dual's sync state, then flag the host values as modified */
static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[])
{
  Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr);

  PetscFunctionBegin;
  if (aijkok) {
    aijkok->a_dual.clear_sync_state();
    aijkok->a_dual.modify_host();
  }
  PetscFunctionReturn(0);
}

// MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device.
Note, "deep_copy" here is really a shallow copy PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) { Mat_SeqAIJKokkos *aijkok = static_cast(A->spptr); Kokkos::View h_mat_k(h_mat); PetscFunctionBegin; if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); PetscFunctionReturn(0); } // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) { Mat_SeqAIJKokkos *aijkok = static_cast(A->spptr); PetscFunctionBegin; if (aijkok && aijkok->device_mat_d.data()) { *d_mat = aijkok->device_mat_d.data(); } else { PetscErrorCode ierr; ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) *d_mat = NULL; } PetscFunctionReturn(0); } /* Generate the transpose on device and cache it internally */ static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) { Mat_SeqAIJKokkos *aijkok = static_cast(A->spptr); PetscFunctionBegin; if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ /* FIXME: KK does not separate symbolic/numeric transpose. 
We could have a permutation array to help value-only update */ CHKERRCXX(aijkok->a_dual.sync_device()); CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); aijkok->transpose_updated = PETSC_TRUE; } *csrmatT = &aijkok->csrmatT; PetscFunctionReturn(0); } /* Generate the Hermitian on device and cache it internally */ static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) { Mat_SeqAIJKokkos *aijkok = static_cast(A->spptr); PetscFunctionBegin; if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ CHKERRCXX(aijkok->a_dual.sync_device()); CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); #if defined(PETSC_USE_COMPLEX) const auto& a = aijkok->csrmatH.values; Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); #endif aijkok->hermitian_updated = PETSC_TRUE; } *csrmatH = &aijkok->csrmatH; PetscFunctionReturn(0); } /* y = A x */ static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) { PetscErrorCode ierr; Mat_SeqAIJKokkos *aijkok; ConstPetscScalarKokkosView xv; PetscScalarKokkosView yv; PetscFunctionBegin; ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); aijkok = static_cast(A->spptr); KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); ierr = WaitForKokkos();CHKERRQ(ierr); /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. 
*/ ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); PetscFunctionReturn(0); } /* y = A^T x */ static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) { PetscErrorCode ierr; Mat_SeqAIJKokkos *aijkok; const char *mode; ConstPetscScalarKokkosView xv; PetscScalarKokkosView yv; KokkosCsrMatrix *csrmat; PetscFunctionBegin; ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); if (A->form_explicit_transpose) { ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); mode = "N"; } else { aijkok = static_cast(A->spptr); csrmat = &aijkok->csrmat; mode = "T"; } KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); ierr = WaitForKokkos();CHKERRQ(ierr); ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); PetscFunctionReturn(0); } /* y = A^H x */ static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) { PetscErrorCode ierr; Mat_SeqAIJKokkos *aijkok; const char *mode; ConstPetscScalarKokkosView xv; PetscScalarKokkosView yv; KokkosCsrMatrix *csrmat; PetscFunctionBegin; ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); if (A->form_explicit_transpose) { ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); mode = "N"; } else { aijkok = static_cast(A->spptr); csrmat = &aijkok->csrmat; mode = "C"; } KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); ierr = WaitForKokkos();CHKERRQ(ierr); ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); PetscFunctionReturn(0); } /* z = A x + y */ static PetscErrorCode 
MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) { PetscErrorCode ierr; Mat_SeqAIJKokkos *aijkok; ConstPetscScalarKokkosView xv,yv; PetscScalarKokkosView zv; PetscFunctionBegin; ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); if (zz != yy) Kokkos::deep_copy(zv,yv); aijkok = static_cast(A->spptr); KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); ierr = WaitForKokkos();CHKERRQ(ierr); ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); PetscFunctionReturn(0); } /* z = A^T x + y */ static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) { PetscErrorCode ierr; Mat_SeqAIJKokkos *aijkok; const char *mode; ConstPetscScalarKokkosView xv,yv; PetscScalarKokkosView zv; KokkosCsrMatrix *csrmat; PetscFunctionBegin; ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); if (zz != yy) Kokkos::deep_copy(zv,yv); if (A->form_explicit_transpose) { ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); mode = "N"; } else { aijkok = static_cast(A->spptr); csrmat = &aijkok->csrmat; mode = "T"; } KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); ierr = WaitForKokkos();CHKERRQ(ierr); ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); PetscFunctionReturn(0); } /* z = A^H x + y */ static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat 
A,Vec xx,Vec yy,Vec zz) { PetscErrorCode ierr; Mat_SeqAIJKokkos *aijkok; const char *mode; ConstPetscScalarKokkosView xv,yv; PetscScalarKokkosView zv; KokkosCsrMatrix *csrmat; PetscFunctionBegin; ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); if (zz != yy) Kokkos::deep_copy(zv,yv); if (A->form_explicit_transpose) { ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); mode = "N"; } else { aijkok = static_cast(A->spptr); csrmat = &aijkok->csrmat; mode = "C"; } KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); ierr = WaitForKokkos();CHKERRQ(ierr); ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); PetscFunctionReturn(0); } PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) { PetscErrorCode ierr; Mat_SeqAIJKokkos *aijkok = static_cast(A->spptr); PetscFunctionBegin; switch (op) { case MAT_FORM_EXPLICIT_TRANSPOSE: /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} A->form_explicit_transpose = flg; break; default: ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); break; } PetscFunctionReturn(0); } /* Depending on reuse, either build a new mat, or use the existing mat */ PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) { PetscErrorCode ierr; Mat_SeqAIJ *aseq; PetscFunctionBegin; ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the 
returned newmat is a SeqAIJKokkos */ } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); aseq = static_cast(A->data); if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */ if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); } } PetscFunctionReturn(0); } /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. */ static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) { PetscErrorCode ierr; Mat_SeqAIJ *bseq; Mat_SeqAIJKokkos *akok = static_cast(A->spptr),*bkok; Mat mat; PetscFunctionBegin; /* Do not copy values on host as A's latest values might be on device. 
We don't want to do sync blindly */ ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); mat = *B; if (A->assembled) { bseq = static_cast(mat->data); bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ /* Now copy values to B if needed */ if (dupOption == MAT_COPY_VALUES) { if (akok->a_dual.need_sync_device()) { Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); bkok->a_dual.modify_host(); } else { /* If device has the latest data, we only copy data on device */ Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); bkok->a_dual.modify_device(); } } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ bkok->a_dual.modify_host(); } mat->spptr = bkok; } ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); PetscFunctionReturn(0); } static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) { PetscErrorCode ierr; Mat At; KokkosCsrMatrix *internT,*csrmatT; Mat_SeqAIJKokkos *atkok,*bkok; PetscFunctionBegin; ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */ if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */ CHKERRCXX(atkok = new Mat_SeqAIJKokkos(*csrmatT)); ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr); if (reuse == 
MAT_INITIAL_MATRIX) *B = At; else {ierr = MatHeaderMerge(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */ } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ if ((*B)->assembled) { bkok = static_cast((*B)->spptr); CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr); } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ Mat_SeqAIJ *bseq = static_cast((*B)->data); MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); CHKERRCXX(Kokkos::deep_copy(a_h,internT->values)); CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); } PetscFunctionReturn(0); } static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) { PetscErrorCode ierr; Mat_SeqAIJKokkos *aijkok; PetscFunctionBegin; if (A->factortype == MAT_FACTOR_NONE) { aijkok = static_cast(A->spptr); if (aijkok) { if (aijkok->device_mat_d.data()) { delete aijkok->colmap_d; delete aijkok->i_uncompressed_d; } if (aijkok->diag_d) delete aijkok->diag_d; } delete aijkok; } else { delete static_cast(A->spptr); } ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); A->spptr = NULL; ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); PetscFunctionReturn(0); } PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) { PetscErrorCode ierr; PetscFunctionBegin; ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); PetscFunctionReturn(0); } /* Merge A, B into a matrix C. A is put before B. 
C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) { PetscErrorCode ierr; Mat_SeqAIJ *a,*b; Mat_SeqAIJKokkos *akok,*bkok,*ckok; MatScalarKokkosView aa,ba,ca; MatRowMapKokkosView ai,bi,ci; MatColIdxKokkosView aj,bj,cj; PetscInt m,n,nnz,aN; PetscFunctionBegin; PetscValidHeaderSpecific(A,MAT_CLASSID,1); PetscValidHeaderSpecific(B,MAT_CLASSID,2); PetscValidPointer(C,4); PetscCheckTypeName(A,MATSEQAIJKOKKOS); PetscCheckTypeName(B,MATSEQAIJKOKKOS); if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); a = static_cast(A->data); b = static_cast(B->data); akok = static_cast(A->spptr); bkok = static_cast(B->spptr); aa = akok->a_dual.view_device(); ai = akok->i_dual.view_device(); ba = bkok->a_dual.view_device(); bi = bkok->i_dual.view_device(); m = A->rmap->n; /* M, N and nnz of C */ n = A->cmap->n + B->cmap->n; nnz = a->nz + b->nz; aN = A->cmap->n; /* N of A */ if (reuse == MAT_INITIAL_MATRIX) { aj = akok->j_dual.view_device(); bj = bkok->j_dual.view_device(); auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); ca = ca_dual.view_device(); ci = ci_dual.view_device(); cj = cj_dual.view_device(); /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { PetscInt i = t.league_rank(); /* row i */ PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only 
happens once per whole team */ ci(i) = coffset; if (i == m-1) ci(m) = ai(m) + bi(m); }); Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { if (k < alen) { ca(coffset+k) = aa(ai(i)+k); cj(coffset+k) = aj(ai(i)+k); } else { ca(coffset+k) = ba(bi(i)+k-alen); cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ } }); }); ca_dual.modify_device(); ci_dual.modify_device(); cj_dual.modify_device(); CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); } else if (reuse == MAT_REUSE_MATRIX) { PetscValidHeaderSpecific(*C,MAT_CLASSID,4); PetscCheckTypeName(*C,MATSEQAIJKOKKOS); ckok = static_cast((*C)->spptr); ca = ckok->a_dual.view_device(); ci = ckok->i_dual.view_device(); Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { PetscInt i = t.league_rank(); /* row i */ PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); else ca(ci(i)+k) = ba(bi(i)+k-alen); }); }); ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); } PetscFunctionReturn(0); } static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) { PetscFunctionBegin; delete static_cast(pdata); PetscFunctionReturn(0); } static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) { PetscErrorCode ierr; Mat_Product *product = C->product; Mat A,B; bool transA,transB; /* use bool, since KK needs this type */ Mat_SeqAIJKokkos *akok,*bkok,*ckok; Mat_SeqAIJ *c; MatProductData_SeqAIJKokkos *pdata; KokkosCsrMatrix *csrmatA,*csrmatB; PetscFunctionBegin; MatCheckProduct(C,1); if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); pdata = static_cast(C->product->data); if (pdata->reusesym) { /* We reached here through e.g., 
MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ PetscFunctionReturn(0); } switch (product->type) { case MATPRODUCT_AB: transA = false; transB = false; break; case MATPRODUCT_AtB: transA = true; transB = false; break; case MATPRODUCT_ABt: transA = false; transB = true; break; default: SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); } A = product->A; B = product->B; ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); akok = static_cast(A->spptr); bkok = static_cast(B->spptr); ckok = static_cast(C->spptr); if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); csrmatA = &akok->csrmat; csrmatB = &bkok->csrmat; /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ if (transA) { ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); transA = false; } if (transB) { ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); transB = false; } CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); /* shorter version of MatAssemblyEnd_SeqAIJ */ c = (Mat_SeqAIJ*)C->data; ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); c->reallocs = 0; C->info.mallocs = 0; C->info.nz_unneeded = 0; C->assembled = C->was_assembled = PETSC_TRUE; 
C->num_ass++; PetscFunctionReturn(0); } static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) { PetscErrorCode ierr; Mat_Product *product = C->product; MatProductType ptype; Mat A,B; bool transA,transB; Mat_SeqAIJKokkos *akok,*bkok,*ckok; MatProductData_SeqAIJKokkos *pdata; MPI_Comm comm; KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; PetscFunctionBegin; MatCheckProduct(C,1); ierr = PetscObjectGetComm((PetscObject)C,&comm); if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty"); A = product->A; B = product->B; ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); akok = static_cast(A->spptr); bkok = static_cast(B->spptr); csrmatA = &akok->csrmat; csrmatB = &bkok->csrmat; ptype = product->type; switch (ptype) { case MATPRODUCT_AB: transA = false; transB = false; break; case MATPRODUCT_AtB: transA = true; transB = false; break; case MATPRODUCT_ABt: transA = false; transB = true; break; default: SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); } product->data = pdata = new MatProductData_SeqAIJKokkos(); pdata->kh.set_team_work_size(16); pdata->kh.set_dynamic_scheduling(true); pdata->reusesym = product->api_user; /* TODO: add command line options to select spgemm algorithms */ auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; #if defined(PETSC_HAVE_CUDA) #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) /* This algorithm + cuda-10.2 sometimes gave wrong results (invalid device pointers in csrmatC) and failed snes/tutorials/ex56.c */ spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_CUSPARSE; #endif #endif pdata->kh.create_spgemm_handle(spgemm_alg); /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ if (transA) { ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); transA = false; } if (transB) { ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); transB = false; 
} CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before calling new Mat_SeqAIJKokkos(). TODO: Remove the fake spgemm_numeric() after KK fixed this problem. */ CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; PetscFunctionReturn(0); } /* handles sparse matrix matrix ops */ static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) { PetscErrorCode ierr; Mat_Product *product = mat->product; PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; PetscFunctionBegin; MatCheckProduct(mat,1); ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); if (product->type == MATPRODUCT_ABC) { ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); } if (Biskok && Ciskok) { switch (product->type) { case MATPRODUCT_AB: case MATPRODUCT_AtB: case MATPRODUCT_ABt: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; break; case MATPRODUCT_PtAP: case MATPRODUCT_RARt: case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break; default: break; } } else { /* fallback for AIJ */ ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); } PetscFunctionReturn(0); } static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) { PetscErrorCode ierr; Mat_SeqAIJKokkos *aijkok; PetscFunctionBegin; ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); aijkok = static_cast(A->spptr); KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); ierr 
= WaitForKokkos();CHKERRQ(ierr); ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); PetscFunctionReturn(0); } static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) { PetscErrorCode ierr; Mat_SeqAIJKokkos *aijkok; PetscFunctionBegin; aijkok = static_cast(A->spptr); if (aijkok) { /* Only zero the device if data is already there */ KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); } else { /* Might be preallocated but not assembled */ ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); } PetscFunctionReturn(0); } /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv) { PetscErrorCode ierr; Mat_SeqAIJKokkos *aijkok; PetscFunctionBegin; PetscValidHeaderSpecific(A,MAT_CLASSID,1); PetscValidPointer(kv,2); PetscCheckTypeName(A,MATSEQAIJKOKKOS); ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); aijkok = static_cast(A->spptr); *kv = aijkok->a_dual.view_device(); PetscFunctionReturn(0); } PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv) { PetscFunctionBegin; PetscValidHeaderSpecific(A,MAT_CLASSID,1); PetscValidPointer(kv,2); PetscCheckTypeName(A,MATSEQAIJKOKKOS); PetscFunctionReturn(0); } PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv) { PetscErrorCode ierr; Mat_SeqAIJKokkos *aijkok; PetscFunctionBegin; PetscValidHeaderSpecific(A,MAT_CLASSID,1); PetscValidPointer(kv,2); PetscCheckTypeName(A,MATSEQAIJKOKKOS); ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); aijkok = static_cast(A->spptr); *kv = aijkok->a_dual.view_device(); PetscFunctionReturn(0); } PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv) { PetscErrorCode ierr; PetscFunctionBegin; PetscValidHeaderSpecific(A,MAT_CLASSID,1); PetscValidPointer(kv,2); PetscCheckTypeName(A,MATSEQAIJKOKKOS); ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); PetscFunctionReturn(0); } PetscErrorCode 
MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) { Mat_SeqAIJKokkos *aijkok; PetscFunctionBegin; PetscValidHeaderSpecific(A,MAT_CLASSID,1); PetscValidPointer(kv,2); PetscCheckTypeName(A,MATSEQAIJKOKKOS); aijkok = static_cast(A->spptr); *kv = aijkok->a_dual.view_device(); PetscFunctionReturn(0); } PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) { PetscErrorCode ierr; PetscFunctionBegin; PetscValidHeaderSpecific(A,MAT_CLASSID,1); PetscValidPointer(kv,2); PetscCheckTypeName(A,MATSEQAIJKOKKOS); ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); PetscFunctionReturn(0); } /* Computes Y += alpha X */ static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) { PetscErrorCode ierr; Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; Mat_SeqAIJKokkos *xkok,*ykok,*zkok; ConstMatScalarKokkosView Xa; MatScalarKokkosView Ya; PetscFunctionBegin; PetscCheckTypeName(Y,MATSEQAIJKOKKOS); PetscCheckTypeName(X,MATSEQAIJKOKKOS); ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ PetscBool e; ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); if (e) { ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); if (e) pattern = SAME_NONZERO_PATTERN; } } /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 
*/ ykok = static_cast(Y->spptr); xkok = static_cast(X->spptr); Xa = xkok->a_dual.view_device(); Ya = ykok->a_dual.view_device(); if (pattern == SAME_NONZERO_PATTERN) { KokkosBlas::axpy(alpha,Xa,Ya); ierr = MatSeqAIJKokkosModifyDevice(Y); } else if (pattern == SUBSET_NONZERO_PATTERN) { MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { PetscInt i = t.league_rank(); /* row i */ Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ PetscInt p,q = Yi(i); for (p=Xi(i); pcsrmat,ykok->csrmat,zcsr); KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); zkok = new Mat_SeqAIJKokkos(zcsr); ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); kh.destroy_spadd_handle(); } PetscFunctionReturn(0); } static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) { PetscErrorCode ierr; PetscFunctionBegin; ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); B->offloadmask = PETSC_OFFLOAD_CPU; PetscFunctionReturn(0); } static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) { Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; PetscFunctionBegin; A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ A->boundtocpu = PETSC_FALSE; A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; A->ops->destroy = MatDestroy_SeqAIJKokkos; A->ops->duplicate = MatDuplicate_SeqAIJKokkos; A->ops->axpy = MatAXPY_SeqAIJKokkos; A->ops->scale = MatScale_SeqAIJKokkos; A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; A->ops->mult = MatMult_SeqAIJKokkos; A->ops->multadd = MatMultAdd_SeqAIJKokkos; 
A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; A->ops->transpose = MatTranspose_SeqAIJKokkos; A->ops->setoption = MatSetOption_SeqAIJKokkos; a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; PetscFunctionReturn(0); } PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) { PetscErrorCode ierr; Mat_SeqAIJ *aseq; PetscInt i,m,n; PetscFunctionBegin; if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); m = akok->nrows(); n = akok->ncols(); ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); /* Set up data structures of A as a MATSEQAIJ */ ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); aseq = (Mat_SeqAIJ*)(A)->data; akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ akok->j_dual.sync_host(); aseq->i = akok->i_host_data(); aseq->j = akok->j_host_data(); aseq->a = akok->a_host_data(); aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ aseq->singlemalloc = PETSC_FALSE; aseq->free_a = PETSC_FALSE; aseq->free_ij = PETSC_FALSE; aseq->nz = akok->nnz(); aseq->maxnz = aseq->nz; ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); for (i=0; iilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; } /* It is 
critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ akok->nonzerostate = A->nonzerostate; ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); A->spptr = akok; PetscFunctionReturn(0); } /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR */ PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) { PetscErrorCode ierr; PetscFunctionBegin; ierr = MatCreate(comm,A);CHKERRQ(ierr); ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); PetscFunctionReturn(0); } /* --------------------------------------------------------------------------------*/ /*@C MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format (the default parallel PETSc format). This matrix will ultimately be handled by Kokkos for calculations. For good matrix assembly performance the user should preallocate the matrix storage by setting the parameter nz (or the array nnz). By setting these parameters accurately, performance during matrix assembly can be increased by more than a factor of 50. Collective Input Parameters: + comm - MPI communicator, set to PETSC_COMM_SELF . m - number of rows . n - number of columns . nz - number of nonzeros per row (same for all rows) - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or NULL Output Parameter: . A - the matrix It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), MatXXXXSetPreallocation() paradgm instead of this routine directly. [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] Notes: If nnz is given then nz is ignored The AIJ format (also called the Yale sparse matrix format or compressed row storage), is fully compatible with standard Fortran 77 storage. 
That is, the stored row and column indices can begin at either one (as in Fortran) or zero. See the users' manual for details. Specify the preallocated storage with either nz or nnz (not both). Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory allocation. For large problems you MUST preallocate memory or you will get TERRIBLE performance, see the users' manual chapter on matrices. By default, this format uses inodes (identical nodes) when possible, to improve numerical efficiency of matrix-vector products and solves. We search for consecutive rows with the same nonzero structure, thereby reusing matrix information to achieve increased efficiency. Level: intermediate .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() @*/ PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) { PetscErrorCode ierr; PetscFunctionBegin; ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); ierr = MatCreate(comm,A);CHKERRQ(ierr); ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); PetscFunctionReturn(0); } typedef Kokkos::TeamPolicy<>::member_type team_member; // // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container. 
// Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization // static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) { Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; Mat_SeqAIJKokkos *aijkok = static_cast(A->spptr), *baijkok = static_cast(B->spptr); IS isrow = b->row,isicol = b->icol; PetscErrorCode ierr; const PetscInt *r_h,*ic_h; const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data(); const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); PetscScalar *ba_d = baijkok->a_dual.view_device().data(); PetscBool row_identity,col_identity; PetscInt nc, Nf, nVec=32; // should be a parameter PetscContainer container; PetscFunctionBegin; if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n); ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); if (container) { PetscInt *pNf=NULL, nv; ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); Nf = (*pNf)%1000; nv = (*pNf)/1000; if (nv>0) nVec = nv; } else Nf = 1; if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf); ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); #endif ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); { #define KOKKOS_SHARED_LEVEL 1 using scr_mem_t = 
Kokkos::DefaultExecutionSpace::scratch_memory_space; using sizet_scr_t = Kokkos::View; using scalar_scr_t = Kokkos::View; const Kokkos::View > h_r_k (r_h, n); Kokkos::View d_r_k ("r", n); const Kokkos::View > h_ic_k (ic_h, nc); Kokkos::View d_ic_k ("ic", nc); size_t flops_h = 0.0; Kokkos::View h_flops_k (&flops_h); Kokkos::View d_flops_k ("flops"); const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; Kokkos::deep_copy (d_flops_k, h_flops_k); Kokkos::deep_copy (d_r_k, h_r_k); Kokkos::deep_copy (d_ic_k, h_ic_k); // Fill A --> fact Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); // zero rows of B Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag PetscScalar *baL = ba_d + bi_d[rowb]; PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; /* zero (unfactored row) */ for (int j=0;j rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); PetscInt nz = (colb > rowb) ? 
bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; for (int j=0; j(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA const PetscInt start = field*nloc, end = start + nloc; Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); // A22 panel update for each row A(1,:) and col A(:,1) for (int ii=start; ii= nzUi) /* void */ ; else { const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) size_t st_idx; // find and do L(k,i) = A(:k,i) / A(i,i) Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); // get column, there has got to be a better way Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { if (pjL[j] == ii) { PetscScalar *pLki = ba_d + bi_d[myk] + j; idx = j; // output *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) } }, st_idx); Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); #if defined(PETSC_USE_DEBUG) if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register #endif // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i // U(i+1,:end) Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) PetscScalar Uij = baUi[uiIdx]; 
PetscInt col = bjUi[uiIdx]; if (col==myk) { // A_kk = A_kk - L_ki * U_ij(k) PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place *Akkv = *Akkv - L_ki() * Uij; // UiK } else { PetscScalar *start, *end, *pAkjv=NULL; PetscInt high, low; const PetscInt *startj; if (col 5) { int t = (low+high)/2; if (startj[t] > col) high = t; else low = t; } for (pAkjv=start+low; pAkjv(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters /* Invert diagonal for simpler triangular solves */ Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { int i = start + outer_index*Ni + lg_rank%Ni; if (i < end) { PetscScalar *pv = ba_d + bdiag_d[i]; *pv = 1.0/(*pv); } }); }); } #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); #endif ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); if (b->inode.size) { B->ops->solve = MatSolve_SeqAIJ_Inode; } else if (row_identity && col_identity) { B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; } else { B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos } B->offloadmask = PETSC_OFFLOAD_GPU; ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; B->ops->matsolve = MatMatSolve_SeqAIJ; B->assembled = PETSC_TRUE; B->preallocated = PETSC_TRUE; PetscFunctionReturn(0); } static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) { PetscErrorCode ierr; PetscFunctionBegin; 
ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; PetscFunctionReturn(0); } static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) { Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; PetscFunctionBegin; if (!factors->sptrsv_symbolic_completed) { KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); factors->sptrsv_symbolic_completed = PETSC_TRUE; } PetscFunctionReturn(0); } /* Check if we need to update factors etc for transpose solve */ static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) { Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; MatColIdxType n = A->rmap->n; PetscFunctionBegin; if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ /* Update L^T and do sptrsv symbolic */ factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); KokkosKernels::Impl::transpose_matrix< ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, MatRowMapKokkosView,DefaultExecutionSpace>( n,n,factors->iL_d,factors->jL_d,factors->aL_d, factors->iLt_d,factors->jLt_d,factors->aLt_d); /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 
*/ KokkosKernels::sort_crs_matrix( factors->iLt_d,factors->jLt_d,factors->aLt_d); KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); /* Update U^T and do sptrsv symbolic */ factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); KokkosKernels::Impl::transpose_matrix< ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, MatRowMapKokkosView,DefaultExecutionSpace>( n,n,factors->iU_d, factors->jU_d, factors->aU_d, factors->iUt_d,factors->jUt_d,factors->aUt_d); /* Sort indices. See comments above */ KokkosKernels::sort_crs_matrix( factors->iUt_d,factors->jUt_d,factors->aUt_d); KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); factors->transpose_updated = PETSC_TRUE; } PetscFunctionReturn(0); } /* Solve Ax = b, with A = LU */ static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) { PetscErrorCode ierr; ConstPetscScalarKokkosView bv; PetscScalarKokkosView xv; Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; PetscFunctionBegin; ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); /* Solve L tmpv = b */ CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); /* Solve Ux = tmpv */ CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); PetscFunctionReturn(0); } /* Solve A^T x = b, where 
A^T = U^T L^T */
/*
  MatSolveTranspose_SeqAIJKokkos - Transposed triangular solves on device using the factors stored in A->spptr.

  Input Parameters:
+ A - the factored matrix; A->spptr is read as Mat_SeqAIJKokkosTriFactors
- b - the right-hand side

  Output Parameter:
. x - the solution

  Notes:
  factors->workVector holds the intermediate result between the two solves.
  The Kokkos Kernels calls are wrapped in CHKERRCXX, as in MatSolve_SeqAIJKokkos(), so that a C++
  exception is reported through the PETSc error chain instead of propagating out of this function.
*/
static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x)
{
  PetscErrorCode             ierr;
  ConstPetscScalarKokkosView bv;
  PetscScalarKokkosView      xv;
  Mat_SeqAIJKokkosTriFactors *factors = static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr);

  PetscFunctionBegin;
  /* Make sure L^T, U^T and their sptrsv handles are up to date */
  ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr);
  ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr);
  ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr);
  /* Solve U^T tmpv = b */
  CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector));
  /* Solve L^T x = tmpv */
  CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv));
  ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr);
  ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  MatILUFactorNumeric_SeqAIJKokkos - Numeric ILU(k) factorization on device via KokkosSparse spiluk_numeric.

  Input Parameters:
+ B    - the factor matrix; B->spptr is read as Mat_SeqAIJKokkosTriFactors
. A    - the matrix to factor; its values are sync'ed to device first
- info - factorization options; only info->levels (the fill level) is read here

  Notes:
  On return B uses the device solve and transposed solve; matsolve and matsolvetranspose are cleared.
*/
static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info)
{
  PetscErrorCode             ierr;
  Mat_SeqAIJKokkos           *aijkok   = static_cast<Mat_SeqAIJKokkos*>(A->spptr);
  Mat_SeqAIJKokkosTriFactors *factors  = static_cast<Mat_SeqAIJKokkosTriFactors*>(B->spptr);
  PetscInt                   fill_lev  = info->levels;

  PetscFunctionBegin;
  ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);

  auto a_d = aijkok->a_dual.view_device();
  auto i_d = aijkok->i_dual.view_device();
  auto j_d = aijkok->j_dual.view_device();

  /* Wrapped in CHKERRCXX like the other Kokkos Kernels calls in this file */
  CHKERRCXX(KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d));

  B->assembled              = PETSC_TRUE;
  B->preallocated           = PETSC_TRUE;
  B->ops->solve             = MatSolve_SeqAIJKokkos;
  B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos;
  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;
  B->offloadmask            = PETSC_OFFLOAD_GPU;

  /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
  factors->transpose_updated         = PETSC_FALSE;
  factors->sptrsv_symbolic_completed = PETSC_FALSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode
MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
{
  /*
    Symbolic ILU(k) factorization on device via KokkosSparse spiluk_symbolic.

    B     - the factor matrix; its Mat_SeqAIJKokkosTriFactors (B->spptr) is reset or created here
    A     - the matrix to factor; its i/j device views are the input pattern
    isrow - not referenced in this function
    iscol - not referenced in this function
    info  - info->levels is the fill level, info->fill sizes the initial L/U allocations
  */
  PetscErrorCode             ierr;
  Mat_SeqAIJKokkos           *aijkok;
  Mat_SeqAIJ                 *b;
  Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr;
  PetscInt                   fill_lev = info->levels;
  PetscInt                   nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU;
  PetscInt                   n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr);
  /* Rebuild factors */
  if (factors) {factors->Destroy();} /* Destroy the old if it exists */
  else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);}

  /* Create a spiluk handle and then do symbolic factorization */
  /* Initial nnz estimate for both L and U, derived from info->fill and nnz(A) */
  nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA);
  factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU);

  auto spiluk_handle = factors->kh.get_spiluk_handle();

  /* Row maps have n+1 entries; column-index arrays are sized from the handle's current nnz values */
  Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */
  Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL());
  Kokkos::realloc(factors->iU_d,n+1);
  Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU());

  aijkok = (Mat_SeqAIJKokkos*)A->spptr;
  auto i_d = aijkok->i_dual.view_device();
  auto j_d = aijkok->j_dual.view_device();
  KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d);
  /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()?
*/ Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); /* TODO: add options to select sptrsv algorithms */ /* Create sptrsv handles for L, U and their transpose */ #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; #else auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; #endif factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); /* Fill fields of the factor matrix B */ ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); b = (Mat_SeqAIJ*)B->data; b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); B->info.fill_ratio_given = info->fill; B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); B->offloadmask = PETSC_OFFLOAD_GPU; B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; PetscFunctionReturn(0); } static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) { PetscErrorCode ierr; Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; const PetscInt nrows = A->rmap->n; PetscFunctionBegin; ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; // move B data into Kokkos ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok { Mat_SeqAIJKokkos *baijkok = static_cast(B->spptr); if 
(!baijkok->diag_d) { const Kokkos::View > h_diag (b->diag,nrows+1); baijkok->diag_d = new Kokkos::View(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); Kokkos::deep_copy (*baijkok->diag_d, h_diag); } } PetscFunctionReturn(0); } static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) { PetscFunctionBegin; *type = MATSOLVERKOKKOS; PetscFunctionReturn(0); } static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) { PetscFunctionBegin; *type = MATSOLVERKOKKOSDEVICE; PetscFunctionReturn(0); } /*MC MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices on a single GPU of type, SeqAIJKokkos, AIJKokkos. Level: beginner .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation M*/ PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor__ */ { PetscErrorCode ierr; PetscInt n = A->rmap->n; PetscFunctionBegin; ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); (*B)->factortype = ftype; ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); if (ftype == MAT_FACTOR_LU) { ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); (*B)->canuseordering = PETSC_TRUE; (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; } else if (ftype == MAT_FACTOR_ILU) { ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); (*B)->canuseordering = PETSC_FALSE; (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); ierr = 
PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  MatGetFactor_seqaijkokkos_kokkos_device - Create the factor matrix B for the MATSOLVERKOKKOSDEVICE solver.

  Input Parameters:
+ A     - the matrix to be factored; only its local row count and block sizes are read here
- ftype - the factor type; only MAT_FACTOR_LU is accepted, anything else raises PETSC_ERR_SUP

  Output Parameter:
. B - the new n x n MATSEQAIJKOKKOS factor matrix
*/
PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B)
{
  PetscErrorCode ierr;
  PetscInt       n = A->rmap->n;

  PetscFunctionBegin;
  ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr);
  ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr);
  (*B)->factortype = ftype;
  (*B)->canuseordering = PETSC_TRUE;
  /* Nested dissection is recorded as the preferred ordering for LU */
  ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr);
  ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr);

  if (ftype == MAT_FACTOR_LU) {
    ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr);
    (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE;
  } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types");

  /* No storage is allocated here (MAT_SKIP_ALLOCATION) */
  ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
  /* Lets callers query which solver package produced this factor */
  ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  MatSolverTypeRegister_KOKKOS - Register the Kokkos factorization back-ends for MATSEQAIJKOKKOS:
  MATSOLVERKOKKOS for LU and ILU, and MATSOLVERKOKKOSDEVICE for LU.
*/
PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr);
  ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Utility to print out a KokkosCsrMatrix for debugging */
PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat)
{
  PetscErrorCode    ierr;
  const auto&       iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map);
  const auto&       jv =
Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); const PetscInt *i = iv.data(); const PetscInt *j = jv.data(); const PetscScalar *a = av.data(); PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); PetscFunctionBegin; ierr = PetscPrintf(PETSC_COMM_SELF,"%D x %D SeqAIJKokkos, with %D nonzeros\n",m,n,nnz);CHKERRQ(ierr); for (PetscInt k=0; k