111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 2076ba34aSJunchao Zhang #include <petscpkg_version.h> 3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 442550becSJunchao Zhang #include <petsc/private/sfimpl.h> 58c3ff71bSJunchao Zhang #include <petscsystypes.h> 68c3ff71bSJunchao Zhang #include <petscerror.h> 78c3ff71bSJunchao Zhang 88c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 9f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 10076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp> 118c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 128c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 1386a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp> 1486a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp> 15076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp> 16076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp> 1786a27549SJunchao Zhang 1842550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 198c3ff71bSJunchao Zhang 208c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 218c3ff71bSJunchao Zhang 22076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23076ba34aSJunchao Zhang we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24076ba34aSJunchao Zhang In the latter case, it is important to set a_dual's sync state correctly. 25076ba34aSJunchao Zhang */ 268c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 278c3ff71bSJunchao Zhang { 28076ba34aSJunchao Zhang Mat_SeqAIJ *aijseq; 29076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 308c3ff71bSJunchao Zhang 318c3ff71bSJunchao Zhang PetscFunctionBegin; 32076ba34aSJunchao Zhang if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 339566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd_SeqAIJ(A,mode)); 34076ba34aSJunchao Zhang 35076ba34aSJunchao Zhang aijseq = static_cast<Mat_SeqAIJ*>(A->data); 36076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 37076ba34aSJunchao Zhang 38076ba34aSJunchao Zhang /* If aijkok does not exist, we just copy i, j to device. 39076ba34aSJunchao Zhang If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 40076ba34aSJunchao Zhang In both cases, we build a new aijkok structure. 41076ba34aSJunchao Zhang */ 42076ba34aSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 43076ba34aSJunchao Zhang delete aijkok; 44076ba34aSJunchao Zhang aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 45076ba34aSJunchao Zhang A->spptr = aijkok; 46076ba34aSJunchao Zhang } 47076ba34aSJunchao Zhang 485519a089SJose E. Roman if (aijkok->device_mat_d.data()) { 49a587d139SMark A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 50a587d139SMark } 518c3ff71bSJunchao Zhang PetscFunctionReturn(0); 528c3ff71bSJunchao Zhang } 538c3ff71bSJunchao Zhang 5486a27549SJunchao Zhang /* Sync CSR data to device if not yet */ 55076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 568c3ff71bSJunchao Zhang { 578c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 588c3ff71bSJunchao Zhang 598c3ff71bSJunchao Zhang PetscFunctionBegin; 605f80ce2aSJacob Faibussowitsch PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 615f80ce2aSJacob Faibussowitsch PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 62076ba34aSJunchao Zhang if (aijkok->a_dual.need_sync_device()) { 63076ba34aSJunchao Zhang aijkok->a_dual.sync_device(); 64580c7c76SPierre Jolivet aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 6586a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 668c3ff71bSJunchao Zhang } 678c3ff71bSJunchao Zhang PetscFunctionReturn(0); 688c3ff71bSJunchao Zhang } 698c3ff71bSJunchao Zhang 70076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */ 71fc76dfabSMark Adams PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 7286a27549SJunchao Zhang { 7386a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 7486a27549SJunchao Zhang 7586a27549SJunchao Zhang PetscFunctionBegin; 765f80ce2aSJacob Faibussowitsch PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 7786a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 7886a27549SJunchao Zhang aijkok->a_dual.modify_device(); 7986a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 8086a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 819566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 829566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 8386a27549SJunchao Zhang PetscFunctionReturn(0); 8486a27549SJunchao Zhang } 8586a27549SJunchao Zhang 86f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 87f0cf5187SStefano Zampini { 88f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 89f0cf5187SStefano Zampini 90f0cf5187SStefano Zampini PetscFunctionBegin; 91f0cf5187SStefano Zampini PetscCheckTypeName(A,MATSEQAIJKOKKOS); 9286a27549SJunchao Zhang /* We do not expect one needs factors on host */ 935f80ce2aSJacob Faibussowitsch PetscCheck(A->factortype == MAT_FACTOR_NONE,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 945f80ce2aSJacob Faibussowitsch PetscCheck(aijkok,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 95076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 96f0cf5187SStefano Zampini PetscFunctionReturn(0); 97f0cf5187SStefano Zampini } 98f0cf5187SStefano Zampini 99f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 100f0cf5187SStefano Zampini { 101076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 102f0cf5187SStefano Zampini 103f0cf5187SStefano Zampini PetscFunctionBegin; 1045519a089SJose E. Roman /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 1055519a089SJose E. Roman Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 1065519a089SJose E. Roman reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 1075519a089SJose E. Roman must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 1085519a089SJose E. Roman */ 1095519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 110076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 111076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 112076ba34aSJunchao Zhang } else { /* Happens when calling MatSetValues on a newly created matrix */ 113076ba34aSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 114076ba34aSJunchao Zhang } 115076ba34aSJunchao Zhang PetscFunctionReturn(0); 116076ba34aSJunchao Zhang } 117076ba34aSJunchao Zhang 118076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 119076ba34aSJunchao Zhang { 120076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 121076ba34aSJunchao Zhang 122076ba34aSJunchao Zhang PetscFunctionBegin; 1235519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 124076ba34aSJunchao Zhang PetscFunctionReturn(0); 125076ba34aSJunchao Zhang } 126076ba34aSJunchao Zhang 127076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 128076ba34aSJunchao Zhang { 129076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 130076ba34aSJunchao Zhang 131076ba34aSJunchao Zhang PetscFunctionBegin; 1325519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 133076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 134076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1352328674fSJunchao Zhang } else { 1362328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 1372328674fSJunchao Zhang } 138076ba34aSJunchao Zhang PetscFunctionReturn(0); 139076ba34aSJunchao Zhang } 140076ba34aSJunchao Zhang 141076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 142076ba34aSJunchao Zhang { 143076ba34aSJunchao Zhang PetscFunctionBegin; 144076ba34aSJunchao Zhang *array = NULL; 145076ba34aSJunchao Zhang PetscFunctionReturn(0); 146076ba34aSJunchao Zhang } 147076ba34aSJunchao Zhang 148076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 149076ba34aSJunchao Zhang { 150076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 151076ba34aSJunchao Zhang 152076ba34aSJunchao Zhang PetscFunctionBegin; 1535519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 154076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1552328674fSJunchao Zhang } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 1562328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 1572328674fSJunchao Zhang } 158076ba34aSJunchao Zhang PetscFunctionReturn(0); 159076ba34aSJunchao Zhang } 160076ba34aSJunchao Zhang 161076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 162076ba34aSJunchao Zhang { 163076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 164076ba34aSJunchao Zhang 165076ba34aSJunchao Zhang PetscFunctionBegin; 1665519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 167076ba34aSJunchao Zhang aijkok->a_dual.clear_sync_state(); 168076ba34aSJunchao Zhang aijkok->a_dual.modify_host(); 1692328674fSJunchao Zhang } 170f0cf5187SStefano Zampini PetscFunctionReturn(0); 171f0cf5187SStefano Zampini } 172f0cf5187SStefano Zampini 1737ee59b9bSJunchao Zhang static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt **i,const PetscInt **j,PetscScalar **a,PetscMemType *mtype) 1747ee59b9bSJunchao Zhang { 1757ee59b9bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1767ee59b9bSJunchao Zhang 1777ee59b9bSJunchao Zhang PetscFunctionBegin; 1787ee59b9bSJunchao Zhang PetscCheck(aijkok != NULL,PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"aijkok is NULL"); 1797ee59b9bSJunchao Zhang 1807ee59b9bSJunchao Zhang if (i) *i = aijkok->i_device_data(); 1817ee59b9bSJunchao Zhang if (j) *j = aijkok->j_device_data(); 1827ee59b9bSJunchao Zhang if (a) { 1837ee59b9bSJunchao Zhang aijkok->a_dual.sync_device(); 1847ee59b9bSJunchao Zhang *a = aijkok->a_device_data(); 1857ee59b9bSJunchao Zhang } 1867ee59b9bSJunchao Zhang if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 1877ee59b9bSJunchao Zhang PetscFunctionReturn(0); 1887ee59b9bSJunchao Zhang } 1897ee59b9bSJunchao Zhang 190a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 191042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 192a587d139SMark { 193042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 194042217e8SBarry Smith Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 195a587d139SMark 196a587d139SMark PetscFunctionBegin; 1975f80ce2aSJacob Faibussowitsch PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 198152b3e56SJunchao Zhang aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 199a587d139SMark Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 200a587d139SMark PetscFunctionReturn(0); 201a587d139SMark } 202a587d139SMark 203a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 204042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 205a587d139SMark { 206042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 207a587d139SMark 208a587d139SMark PetscFunctionBegin; 209a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 210a587d139SMark *d_mat = aijkok->device_mat_d.data(); 211a587d139SMark } else { 2129566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it) 213a587d139SMark *d_mat = NULL; 214a587d139SMark } 215a587d139SMark PetscFunctionReturn(0); 216a587d139SMark } 217076ba34aSJunchao Zhang 218076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */ 219076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 220152b3e56SJunchao Zhang { 221152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 222152b3e56SJunchao Zhang 223152b3e56SJunchao Zhang PetscFunctionBegin; 2245f80ce2aSJacob Faibussowitsch PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 225076ba34aSJunchao Zhang if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 226076ba34aSJunchao Zhang /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 2279566063dSJacob Faibussowitsch PetscCallCXX(aijkok->a_dual.sync_device()); 2289566063dSJacob Faibussowitsch PetscCallCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 2299566063dSJacob Faibussowitsch PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 23086a27549SJunchao Zhang aijkok->transpose_updated = PETSC_TRUE; 231076ba34aSJunchao Zhang } 232076ba34aSJunchao Zhang *csrmatT = &aijkok->csrmatT; 233152b3e56SJunchao Zhang PetscFunctionReturn(0); 234152b3e56SJunchao Zhang } 235152b3e56SJunchao Zhang 236076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */ 237076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 238152b3e56SJunchao Zhang { 239152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 240152b3e56SJunchao Zhang 241152b3e56SJunchao Zhang PetscFunctionBegin; 2429566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2435f80ce2aSJacob Faibussowitsch PetscCheck(aijkok,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 244076ba34aSJunchao Zhang if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 2459566063dSJacob Faibussowitsch PetscCallCXX(aijkok->a_dual.sync_device()); 2469566063dSJacob Faibussowitsch PetscCallCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 2479566063dSJacob Faibussowitsch PetscCallCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 248076ba34aSJunchao Zhang #if defined(PETSC_USE_COMPLEX) 249076ba34aSJunchao Zhang const auto& a = aijkok->csrmatH.values; 250076ba34aSJunchao Zhang Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 251076ba34aSJunchao Zhang #endif 25286a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_TRUE; 253076ba34aSJunchao Zhang } 254076ba34aSJunchao Zhang *csrmatH = &aijkok->csrmatH; 2559566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 256152b3e56SJunchao Zhang PetscFunctionReturn(0); 257152b3e56SJunchao Zhang } 258a587d139SMark 2598c3ff71bSJunchao Zhang /* y = A x */ 2608c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2618c3ff71bSJunchao Zhang { 2628c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 263152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 264152b3e56SJunchao Zhang PetscScalarKokkosView yv; 2658c3ff71bSJunchao Zhang 2668c3ff71bSJunchao Zhang PetscFunctionBegin; 2679566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2689566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2699566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx,&xv)); 2709566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy,&yv)); 2718c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 272152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 2739566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx,&xv)); 2749566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 275076ba34aSJunchao Zhang /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 2769566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 2779566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 2788c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2798c3ff71bSJunchao Zhang } 2808c3ff71bSJunchao Zhang 2818c3ff71bSJunchao Zhang /* y = A^T x */ 2828c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2838c3ff71bSJunchao Zhang { 2848c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 285152b3e56SJunchao Zhang const char *mode; 286152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 287152b3e56SJunchao Zhang PetscScalarKokkosView yv; 288076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 2898c3ff71bSJunchao Zhang 2908c3ff71bSJunchao Zhang PetscFunctionBegin; 2919566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2929566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2939566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx,&xv)); 2949566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy,&yv)); 295152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 2969566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 297152b3e56SJunchao Zhang mode = "N"; 298152b3e56SJunchao Zhang } else { 299076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 300076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 301152b3e56SJunchao Zhang mode = "T"; 302152b3e56SJunchao Zhang } 303076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 3049566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx,&xv)); 3059566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 3069566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 3079566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3088c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3098c3ff71bSJunchao Zhang } 3108c3ff71bSJunchao Zhang 3118c3ff71bSJunchao Zhang /* y = A^H x */ 3128c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 3138c3ff71bSJunchao Zhang { 3148c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 315152b3e56SJunchao Zhang const char *mode; 316152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 317152b3e56SJunchao Zhang PetscScalarKokkosView yv; 318076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3198c3ff71bSJunchao Zhang 3208c3ff71bSJunchao Zhang PetscFunctionBegin; 3219566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3229566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3239566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx,&xv)); 3249566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy,&yv)); 325152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 3269566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 327152b3e56SJunchao Zhang mode = "N"; 328152b3e56SJunchao Zhang } else { 329076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 330076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 331152b3e56SJunchao Zhang mode = "C"; 332152b3e56SJunchao Zhang } 333076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 3349566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx,&xv)); 3359566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy,&yv)); 3369566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 3379566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3388c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3398c3ff71bSJunchao Zhang } 3408c3ff71bSJunchao Zhang 3418c3ff71bSJunchao Zhang /* z = A x + y */ 3428c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 3438c3ff71bSJunchao Zhang { 3448c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 345152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 346152b3e56SJunchao Zhang PetscScalarKokkosView zv; 3478c3ff71bSJunchao Zhang 3488c3ff71bSJunchao Zhang PetscFunctionBegin; 3499566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3509566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3519566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx,&xv)); 3529566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(yy,&yv)); 3539566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(zz,&zv)); 3548c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 3558c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 356152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 3579566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx,&xv)); 3589566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(yy,&yv)); 3599566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 3609566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*aijkok->csrmat.nnz())); 3619566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3628c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3638c3ff71bSJunchao Zhang } 3648c3ff71bSJunchao Zhang 3658c3ff71bSJunchao Zhang /* z = A^T x + y */ 3668c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3678c3ff71bSJunchao Zhang { 3688c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 369152b3e56SJunchao Zhang const char *mode; 370152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 371152b3e56SJunchao Zhang PetscScalarKokkosView zv; 372076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3738c3ff71bSJunchao Zhang 3748c3ff71bSJunchao Zhang PetscFunctionBegin; 3759566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3769566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3779566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx,&xv)); 3789566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(yy,&yv)); 3799566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(zz,&zv)); 3808c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 381152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 3829566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat)); 383152b3e56SJunchao Zhang mode = "N"; 384152b3e56SJunchao Zhang } else { 385076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 386076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 387152b3e56SJunchao Zhang mode = "T"; 388152b3e56SJunchao Zhang } 389076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 3909566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx,&xv)); 3919566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(yy,&yv)); 3929566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 3939566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 3949566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3958c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3968c3ff71bSJunchao Zhang } 3978c3ff71bSJunchao Zhang 3988c3ff71bSJunchao Zhang /* z = A^H x + y */ 3998c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 4008c3ff71bSJunchao Zhang { 4018c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 402152b3e56SJunchao Zhang const char *mode; 403152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 404152b3e56SJunchao Zhang PetscScalarKokkosView zv; 405076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 4068c3ff71bSJunchao Zhang 4078c3ff71bSJunchao Zhang PetscFunctionBegin; 4089566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4099566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 4109566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx,&xv)); 4119566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(yy,&yv)); 4129566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(zz,&zv)); 4138c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 414152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 4159566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat)); 416152b3e56SJunchao Zhang mode = "N"; 417152b3e56SJunchao Zhang } else { 418076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 419076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 420152b3e56SJunchao Zhang mode = "C"; 421152b3e56SJunchao Zhang } 422076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 4239566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx,&xv)); 4249566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(yy,&yv)); 4259566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(zz,&zv)); 4269566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0*csrmat->nnz())); 4279566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 428152b3e56SJunchao Zhang PetscFunctionReturn(0); 429152b3e56SJunchao Zhang } 430152b3e56SJunchao Zhang 431152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 432152b3e56SJunchao Zhang { 433152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 434152b3e56SJunchao Zhang 435152b3e56SJunchao Zhang PetscFunctionBegin; 436152b3e56SJunchao Zhang switch (op) { 437152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 438152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 4399566063dSJacob Faibussowitsch if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 440152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 441152b3e56SJunchao Zhang break; 442152b3e56SJunchao Zhang default: 4439566063dSJacob Faibussowitsch PetscCall(MatSetOption_SeqAIJ(A,op,flg)); 444152b3e56SJunchao Zhang break; 445152b3e56SJunchao Zhang } 4468c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4478c3ff71bSJunchao Zhang } 4488c3ff71bSJunchao Zhang 449076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */ 4503d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 4518c3ff71bSJunchao Zhang { 452076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 4538c3ff71bSJunchao Zhang 4548c3ff71bSJunchao Zhang PetscFunctionBegin; 4559566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 456076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 4579566063dSJacob Faibussowitsch PetscCall(MatDuplicate(A,MAT_COPY_VALUES,newmat)); /* the returned newmat is a SeqAIJKokkos */ 4588c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 4599566063dSJacob Faibussowitsch PetscCall(MatCopy(A,*newmat,SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 460076ba34aSJunchao Zhang } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 4615f80ce2aSJacob Faibussowitsch PetscCheck(A == *newmat,PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 4629566063dSJacob Faibussowitsch PetscCall(PetscFree(A->defaultvectype)); 4639566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECKOKKOS,&A->defaultvectype)); /* Allocate and copy the string */ 4649566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS)); 4659566063dSJacob Faibussowitsch PetscCall(MatSetOps_SeqAIJKokkos(A)); 466076ba34aSJunchao Zhang aseq = static_cast<Mat_SeqAIJ*>(A->data); 467394ed5ebSJunchao Zhang if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 4685f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr,PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 469076ba34aSJunchao Zhang A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 4708c3ff71bSJunchao Zhang } 471076ba34aSJunchao Zhang } 4728c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4738c3ff71bSJunchao Zhang } 4748c3ff71bSJunchao Zhang 475076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 476076ba34aSJunchao Zhang an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 477076ba34aSJunchao Zhang */ 478076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 4798c3ff71bSJunchao Zhang { 480076ba34aSJunchao Zhang Mat_SeqAIJ *bseq; 481076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 482076ba34aSJunchao Zhang Mat mat; 4838c3ff71bSJunchao Zhang 4848c3ff71bSJunchao Zhang PetscFunctionBegin; 485076ba34aSJunchao Zhang /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 4869566063dSJacob Faibussowitsch PetscCall(MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B)); 487076ba34aSJunchao Zhang mat = *B; 488076ba34aSJunchao Zhang if (A->assembled) { 489076ba34aSJunchao Zhang bseq = static_cast<Mat_SeqAIJ*>(mat->data); 490076ba34aSJunchao Zhang bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 491076ba34aSJunchao Zhang bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 492076ba34aSJunchao Zhang /* Now copy values to B if needed */ 493076ba34aSJunchao Zhang if (dupOption == MAT_COPY_VALUES) { 494076ba34aSJunchao Zhang if (akok->a_dual.need_sync_device()) { 495076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 496076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 497076ba34aSJunchao Zhang } else { /* If device has the latest data, we only copy data on device */ 498076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 499076ba34aSJunchao Zhang bkok->a_dual.modify_device(); 500076ba34aSJunchao Zhang } 501076ba34aSJunchao Zhang } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 502076ba34aSJunchao Zhang /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 503076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 504076ba34aSJunchao Zhang } 505076ba34aSJunchao Zhang mat->spptr = bkok; 506076ba34aSJunchao Zhang } 507076ba34aSJunchao Zhang 5089566063dSJacob Faibussowitsch PetscCall(PetscFree(mat->defaultvectype)); 5099566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECKOKKOS,&mat->defaultvectype)); /* Allocate and copy the string */ 5109566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS)); 5119566063dSJacob Faibussowitsch PetscCall(MatSetOps_SeqAIJKokkos(mat)); 5128c3ff71bSJunchao Zhang PetscFunctionReturn(0); 5138c3ff71bSJunchao Zhang } 5148c3ff71bSJunchao Zhang 5150ecb592aSJunchao Zhang static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 5160ecb592aSJunchao Zhang { 5170ecb592aSJunchao Zhang Mat At; 518ff751488SJunchao Zhang KokkosCsrMatrix *internT; 5190ecb592aSJunchao Zhang Mat_SeqAIJKokkos *atkok,*bkok; 5200ecb592aSJunchao Zhang 5210ecb592aSJunchao Zhang PetscFunctionBegin; 5229566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&internT)); /* Generate a transpose internally */ 5230ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 524ff751488SJunchao Zhang /* Deep copy internT, as we want to isolate the internal transpose */ 5259566063dSJacob Faibussowitsch PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat",*internT))); 5269566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At)); 5270ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) *B = At; 5289566063dSJacob Faibussowitsch else PetscCall(MatHeaderReplace(A,&At)); /* Replace A with At inplace */ 5290ecb592aSJunchao Zhang } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 5300ecb592aSJunchao Zhang if ((*B)->assembled) { 5310ecb592aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 5329566063dSJacob Faibussowitsch PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 5339566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 5340ecb592aSJunchao Zhang } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 5350ecb592aSJunchao Zhang Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 5360ecb592aSJunchao Zhang MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 5370ecb592aSJunchao Zhang MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 5389566063dSJacob Faibussowitsch PetscCallCXX(Kokkos::deep_copy(a_h,internT->values)); 5399566063dSJacob Faibussowitsch PetscCallCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 5400ecb592aSJunchao Zhang } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 5410ecb592aSJunchao Zhang } 5420ecb592aSJunchao Zhang PetscFunctionReturn(0); 5430ecb592aSJunchao Zhang } 5440ecb592aSJunchao Zhang 5458c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 5468c3ff71bSJunchao Zhang { 54786a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 5488c3ff71bSJunchao Zhang 5498c3ff71bSJunchao Zhang PetscFunctionBegin; 55086a27549SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 55186a27549SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 5528c3ff71bSJunchao Zhang delete aijkok; 55386a27549SJunchao Zhang } else { 55486a27549SJunchao Zhang delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 55586a27549SJunchao Zhang } 556cbc6b225SStefano Zampini A->spptr = NULL; 5579566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL)); 5589566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 5599566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 5609566063dSJacob Faibussowitsch PetscCall(MatDestroy_SeqAIJ(A)); 5618c3ff71bSJunchao Zhang PetscFunctionReturn(0); 5628c3ff71bSJunchao Zhang } 5638c3ff71bSJunchao Zhang 5643f3ba80aSJunchao Zhang /*MC 5653f3ba80aSJunchao Zhang MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 5663f3ba80aSJunchao Zhang 5673f3ba80aSJunchao Zhang A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 5683f3ba80aSJunchao Zhang 5693f3ba80aSJunchao Zhang Options Database Keys: 5703f3ba80aSJunchao Zhang . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions() 5713f3ba80aSJunchao Zhang 5723f3ba80aSJunchao Zhang Level: beginner 5733f3ba80aSJunchao Zhang 574db781477SPatrick Sanan .seealso: `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 5753f3ba80aSJunchao Zhang M*/ 57686a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 57786a27549SJunchao Zhang { 57886a27549SJunchao Zhang PetscFunctionBegin; 5799566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 5809566063dSJacob Faibussowitsch PetscCall(MatCreate_SeqAIJ(A)); 5819566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A)); 58286a27549SJunchao Zhang PetscFunctionReturn(0); 58386a27549SJunchao Zhang } 58486a27549SJunchao Zhang 585076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 586076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 587a3f881fbSStefano Zampini { 588076ba34aSJunchao Zhang Mat_SeqAIJ *a,*b; 589076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok,*bkok,*ckok; 590076ba34aSJunchao Zhang MatScalarKokkosView aa,ba,ca; 591076ba34aSJunchao Zhang MatRowMapKokkosView ai,bi,ci; 592076ba34aSJunchao Zhang MatColIdxKokkosView aj,bj,cj; 593076ba34aSJunchao Zhang PetscInt m,n,nnz,aN; 594a3f881fbSStefano Zampini 595a3f881fbSStefano Zampini PetscFunctionBegin; 596076ba34aSJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 597076ba34aSJunchao Zhang PetscValidHeaderSpecific(B,MAT_CLASSID,2); 598076ba34aSJunchao Zhang PetscValidPointer(C,4); 599076ba34aSJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 600076ba34aSJunchao Zhang PetscCheckTypeName(B,MATSEQAIJKOKKOS); 6015f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 6025f80ce2aSJacob Faibussowitsch PetscCheck(reuse != MAT_INPLACE_MATRIX,PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 603076ba34aSJunchao Zhang 6049566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 6059566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 606076ba34aSJunchao Zhang a = static_cast<Mat_SeqAIJ*>(A->data); 607076ba34aSJunchao Zhang b = static_cast<Mat_SeqAIJ*>(B->data); 608076ba34aSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 609076ba34aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 610076ba34aSJunchao Zhang aa = akok->a_dual.view_device(); 611076ba34aSJunchao Zhang ai = akok->i_dual.view_device(); 612076ba34aSJunchao Zhang ba = bkok->a_dual.view_device(); 613076ba34aSJunchao Zhang bi = bkok->i_dual.view_device(); 614076ba34aSJunchao Zhang m = A->rmap->n; /* M, N and nnz of C */ 615076ba34aSJunchao Zhang n = A->cmap->n + B->cmap->n; 616076ba34aSJunchao Zhang nnz = a->nz + b->nz; 617076ba34aSJunchao Zhang aN = A->cmap->n; /* N of A */ 618076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { 619076ba34aSJunchao Zhang aj = akok->j_dual.view_device(); 620076ba34aSJunchao Zhang bj = bkok->j_dual.view_device(); 621076ba34aSJunchao Zhang auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 622076ba34aSJunchao Zhang auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 623076ba34aSJunchao Zhang auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 624076ba34aSJunchao Zhang ca = ca_dual.view_device(); 625076ba34aSJunchao Zhang ci = ci_dual.view_device(); 626076ba34aSJunchao Zhang cj = cj_dual.view_device(); 627076ba34aSJunchao Zhang 628076ba34aSJunchao Zhang /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 629076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 630076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 631076ba34aSJunchao Zhang PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 632076ba34aSJunchao Zhang 633076ba34aSJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 634076ba34aSJunchao Zhang ci(i) = coffset; 635076ba34aSJunchao Zhang if (i == m-1) ci(m) = ai(m) + bi(m); 636076ba34aSJunchao Zhang }); 637076ba34aSJunchao Zhang 638076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 639076ba34aSJunchao Zhang if (k < alen) { 640076ba34aSJunchao Zhang ca(coffset+k) = aa(ai(i)+k); 641076ba34aSJunchao Zhang cj(coffset+k) = aj(ai(i)+k); 642076ba34aSJunchao Zhang } else { 643076ba34aSJunchao Zhang ca(coffset+k) = ba(bi(i)+k-alen); 644076ba34aSJunchao Zhang cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 645076ba34aSJunchao Zhang } 646076ba34aSJunchao Zhang }); 647076ba34aSJunchao Zhang }); 648076ba34aSJunchao Zhang ca_dual.modify_device(); 649076ba34aSJunchao Zhang ci_dual.modify_device(); 650076ba34aSJunchao Zhang cj_dual.modify_device(); 6519566063dSJacob Faibussowitsch PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 6529566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C)); 653076ba34aSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { 654076ba34aSJunchao Zhang PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 655076ba34aSJunchao Zhang PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 656076ba34aSJunchao Zhang ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 657076ba34aSJunchao Zhang ca = ckok->a_dual.view_device(); 658076ba34aSJunchao Zhang ci = ckok->i_dual.view_device(); 659076ba34aSJunchao Zhang 660076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 661076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 662076ba34aSJunchao Zhang PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 663076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 664076ba34aSJunchao Zhang if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 665076ba34aSJunchao Zhang else ca(ci(i)+k) = ba(bi(i)+k-alen); 666076ba34aSJunchao Zhang }); 667076ba34aSJunchao Zhang }); 6689566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 669076ba34aSJunchao Zhang } 670076ba34aSJunchao Zhang PetscFunctionReturn(0); 671076ba34aSJunchao Zhang } 672076ba34aSJunchao Zhang 673076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 674076ba34aSJunchao Zhang { 675076ba34aSJunchao Zhang PetscFunctionBegin; 676076ba34aSJunchao Zhang delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 677a3f881fbSStefano Zampini PetscFunctionReturn(0); 678a3f881fbSStefano Zampini } 679a3f881fbSStefano Zampini 680a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 681a3f881fbSStefano Zampini { 682a3f881fbSStefano Zampini Mat_Product *product = C->product; 683a3f881fbSStefano Zampini Mat A,B; 684076ba34aSJunchao Zhang bool transA,transB; /* use bool, since KK needs this type */ 685a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok,*bkok,*ckok; 686a3f881fbSStefano Zampini Mat_SeqAIJ *c; 687076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 688076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA,*csrmatB; 689a3f881fbSStefano Zampini 690a3f881fbSStefano Zampini PetscFunctionBegin; 691a3f881fbSStefano Zampini MatCheckProduct(C,1); 6925f80ce2aSJacob Faibussowitsch PetscCheck(C->product->data,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 693076ba34aSJunchao Zhang pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 694076ba34aSJunchao Zhang 695076ba34aSJunchao Zhang if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 696076ba34aSJunchao Zhang pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 697076ba34aSJunchao Zhang PetscFunctionReturn(0); 698076ba34aSJunchao Zhang } 699076ba34aSJunchao Zhang 700076ba34aSJunchao Zhang switch (product->type) { 701076ba34aSJunchao Zhang case MATPRODUCT_AB: transA = false; transB = false; break; 702076ba34aSJunchao Zhang case MATPRODUCT_AtB: transA = true; transB = false; break; 703076ba34aSJunchao Zhang case MATPRODUCT_ABt: transA = false; transB = true; break; 704076ba34aSJunchao Zhang default: 70598921bdaSJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 706076ba34aSJunchao Zhang } 707076ba34aSJunchao Zhang 708a3f881fbSStefano Zampini A = product->A; 709a3f881fbSStefano Zampini B = product->B; 7109566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 7119566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 712a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 713a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 714a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 715076ba34aSJunchao Zhang 7165f80ce2aSJacob Faibussowitsch PetscCheck(ckok,PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 717076ba34aSJunchao Zhang 718076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 719076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 720076ba34aSJunchao Zhang 721076ba34aSJunchao Zhang /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 722076ba34aSJunchao Zhang if (transA) { 7239566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 724076ba34aSJunchao Zhang transA = false; 725a3f881fbSStefano Zampini } 726a3f881fbSStefano Zampini 727076ba34aSJunchao Zhang if (transB) { 7289566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 729076ba34aSJunchao Zhang transB = false; 730076ba34aSJunchao Zhang } 7319566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 7329566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 7339566063dSJacob Faibussowitsch PetscCallCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 7349566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 7359566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(C)); 736a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 737a3f881fbSStefano Zampini c = (Mat_SeqAIJ*)C->data; 7389566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz)); 7399566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n")); 7409566063dSJacob Faibussowitsch PetscCall(PetscInfo(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax)); 741a3f881fbSStefano Zampini c->reallocs = 0; 742076ba34aSJunchao Zhang C->info.mallocs = 0; 743a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 744a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 745a3f881fbSStefano Zampini C->num_ass++; 746a3f881fbSStefano Zampini PetscFunctionReturn(0); 747a3f881fbSStefano Zampini } 748a3f881fbSStefano Zampini 749a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 750a3f881fbSStefano Zampini { 751076ba34aSJunchao Zhang Mat_Product *product = C->product; 752076ba34aSJunchao Zhang MatProductType ptype; 753076ba34aSJunchao Zhang Mat A,B; 754076ba34aSJunchao Zhang bool transA,transB; 755076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok,*bkok,*ckok; 756076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 757076ba34aSJunchao Zhang MPI_Comm comm; 758076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 759a3f881fbSStefano Zampini 760a3f881fbSStefano Zampini PetscFunctionBegin; 761a3f881fbSStefano Zampini MatCheckProduct(C,1); 7629566063dSJacob Faibussowitsch PetscCall(PetscObjectGetComm((PetscObject)C,&comm)); 7635f80ce2aSJacob Faibussowitsch PetscCheck(!product->data,comm,PETSC_ERR_PLIB,"Product data not empty"); 764a3f881fbSStefano Zampini A = product->A; 765a3f881fbSStefano Zampini B = product->B; 7669566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 7679566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 768a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 769a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 770076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 771076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 772076ba34aSJunchao Zhang 773a3f881fbSStefano Zampini ptype = product->type; 774a3f881fbSStefano Zampini switch (ptype) { 775076ba34aSJunchao Zhang case MATPRODUCT_AB: transA = false; transB = false; break; 776076ba34aSJunchao Zhang case MATPRODUCT_AtB: transA = true; transB = false; break; 777076ba34aSJunchao Zhang case MATPRODUCT_ABt: transA = false; transB = true; break; 778a3f881fbSStefano Zampini default: 77998921bdaSJacob Faibussowitsch SETERRQ(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 780a3f881fbSStefano Zampini } 781a3f881fbSStefano Zampini 782076ba34aSJunchao Zhang product->data = pdata = new MatProductData_SeqAIJKokkos(); 783076ba34aSJunchao Zhang pdata->kh.set_team_work_size(16); 784076ba34aSJunchao Zhang pdata->kh.set_dynamic_scheduling(true); 785076ba34aSJunchao Zhang pdata->reusesym = product->api_user; 786a3f881fbSStefano Zampini 787076ba34aSJunchao Zhang /* TODO: add command line options to select spgemm algorithms */ 788076ba34aSJunchao Zhang auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 7896ffb9508SJunchao Zhang /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 7906ffb9508SJunchao Zhang We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 7916ffb9508SJunchao Zhang */ 792076ba34aSJunchao Zhang pdata->kh.create_spgemm_handle(spgemm_alg); 793076ba34aSJunchao Zhang 7949566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 795076ba34aSJunchao Zhang /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 796076ba34aSJunchao Zhang if (transA) { 7979566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA)); 798076ba34aSJunchao Zhang transA = false; 799076ba34aSJunchao Zhang } 800076ba34aSJunchao Zhang 801076ba34aSJunchao Zhang if (transB) { 8029566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB)); 803076ba34aSJunchao Zhang transB = false; 804076ba34aSJunchao Zhang } 805076ba34aSJunchao Zhang 8069566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 807076ba34aSJunchao Zhang /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 808076ba34aSJunchao Zhang So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 809076ba34aSJunchao Zhang calling new Mat_SeqAIJKokkos(). 810076ba34aSJunchao Zhang TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 811076ba34aSJunchao Zhang */ 8129566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 8139566063dSJacob Faibussowitsch PetscCallCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 8149566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 815076ba34aSJunchao Zhang 8169566063dSJacob Faibussowitsch PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 8179566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C,ckok)); 818076ba34aSJunchao Zhang C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 819a3f881fbSStefano Zampini PetscFunctionReturn(0); 820a3f881fbSStefano Zampini } 821a3f881fbSStefano Zampini 822a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 823076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 824a3f881fbSStefano Zampini { 825076ba34aSJunchao Zhang Mat_Product *product = mat->product; 826a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 827a3f881fbSStefano Zampini 828a3f881fbSStefano Zampini PetscFunctionBegin; 829a3f881fbSStefano Zampini MatCheckProduct(mat,1); 8309566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok)); 831a3f881fbSStefano Zampini if (product->type == MATPRODUCT_ABC) { 8329566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok)); 833a3f881fbSStefano Zampini } 834a3f881fbSStefano Zampini if (Biskok && Ciskok) { 835a3f881fbSStefano Zampini switch (product->type) { 836a3f881fbSStefano Zampini case MATPRODUCT_AB: 837a3f881fbSStefano Zampini case MATPRODUCT_AtB: 838a3f881fbSStefano Zampini case MATPRODUCT_ABt: 839a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 840a3f881fbSStefano Zampini break; 841a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 842a3f881fbSStefano Zampini case MATPRODUCT_RARt: 843a3f881fbSStefano Zampini case MATPRODUCT_ABC: 844a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 845a3f881fbSStefano Zampini break; 846a3f881fbSStefano Zampini default: 847a3f881fbSStefano Zampini break; 848a3f881fbSStefano Zampini } 849a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 8509566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 851a3f881fbSStefano Zampini } 852a3f881fbSStefano Zampini PetscFunctionReturn(0); 853a3f881fbSStefano Zampini } 854a587d139SMark 855f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 856f0cf5187SStefano Zampini { 857f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 858f0cf5187SStefano Zampini 859f0cf5187SStefano Zampini PetscFunctionBegin; 8609566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 8619566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 862f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 863076ba34aSJunchao Zhang KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 8649566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 8659566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 8669566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 867f0cf5187SStefano Zampini PetscFunctionReturn(0); 868f0cf5187SStefano Zampini } 869f0cf5187SStefano Zampini 870a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 871a587d139SMark { 872076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 873a587d139SMark 874a587d139SMark PetscFunctionBegin; 875076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 8762328674fSJunchao Zhang if (aijkok) { /* Only zero the device if data is already there */ 877076ba34aSJunchao Zhang KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 8789566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 8792328674fSJunchao Zhang } else { /* Might be preallocated but not assembled */ 8809566063dSJacob Faibussowitsch PetscCall(MatZeroEntries_SeqAIJ(A)); 8812328674fSJunchao Zhang } 882a587d139SMark PetscFunctionReturn(0); 883a587d139SMark } 884a587d139SMark 885f78ce678SMark Adams static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A,Vec x) 886f78ce678SMark Adams { 887f78ce678SMark Adams Mat_SeqAIJ *aijseq; 888f78ce678SMark Adams Mat_SeqAIJKokkos *aijkok; 889f78ce678SMark Adams PetscInt n; 890f78ce678SMark Adams PetscScalarKokkosView xv; 891f78ce678SMark Adams 892f78ce678SMark Adams PetscFunctionBegin; 893f78ce678SMark Adams PetscCall(VecGetLocalSize(x,&n)); 894f78ce678SMark Adams PetscCheck(n == A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector"); 895f78ce678SMark Adams PetscCheck(A->factortype == MAT_FACTOR_NONE,PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 896f78ce678SMark Adams 897f78ce678SMark Adams PetscCall(MatSeqAIJKokkosSyncDevice(A)); 898f78ce678SMark Adams aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 899f78ce678SMark Adams 900f78ce678SMark Adams if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */ 901f78ce678SMark Adams PetscCall(MatMarkDiagonal_SeqAIJ(A)); 902f78ce678SMark Adams aijseq = static_cast<Mat_SeqAIJ*>(A->data); 903f78ce678SMark Adams aijkok->SetDiagonal(aijseq->diag); 904f78ce678SMark Adams } 905f78ce678SMark Adams 906f78ce678SMark Adams const auto& Aa = aijkok->a_dual.view_device(); 907f78ce678SMark Adams const auto& Ai = aijkok->i_dual.view_device(); 908f78ce678SMark Adams const auto& Adiag = aijkok->diag_dual.view_device(); 909f78ce678SMark Adams 910f78ce678SMark Adams PetscCall(VecGetKokkosViewWrite(x,&xv)); 911f78ce678SMark Adams Kokkos::parallel_for(n,KOKKOS_LAMBDA(const PetscInt i) { 912f78ce678SMark Adams if (Adiag(i) < Ai(i+1)) xv(i) = Aa(Adiag(i)); 913f78ce678SMark Adams else xv(i) = 0; 914f78ce678SMark Adams }); 915f78ce678SMark Adams PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 916f78ce678SMark Adams PetscFunctionReturn(0); 917f78ce678SMark Adams } 918f78ce678SMark Adams 919db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 92042550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 921db78de30SJunchao Zhang { 922db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 923db78de30SJunchao Zhang 924db78de30SJunchao Zhang PetscFunctionBegin; 925db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 926db78de30SJunchao Zhang PetscValidPointer(kv,2); 927db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 9289566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 929db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 930076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 931db78de30SJunchao Zhang PetscFunctionReturn(0); 932db78de30SJunchao Zhang } 933db78de30SJunchao Zhang 93442550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 935db78de30SJunchao Zhang { 936db78de30SJunchao Zhang PetscFunctionBegin; 937db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 938db78de30SJunchao Zhang PetscValidPointer(kv,2); 939db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 940db78de30SJunchao Zhang PetscFunctionReturn(0); 941db78de30SJunchao Zhang } 942db78de30SJunchao Zhang 94342550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 944db78de30SJunchao Zhang { 945db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 946db78de30SJunchao Zhang 947db78de30SJunchao Zhang PetscFunctionBegin; 948db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 949db78de30SJunchao Zhang PetscValidPointer(kv,2); 950db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 9519566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 952db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 953076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 954db78de30SJunchao Zhang PetscFunctionReturn(0); 955db78de30SJunchao Zhang } 956db78de30SJunchao Zhang 95742550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 958db78de30SJunchao Zhang { 959db78de30SJunchao Zhang PetscFunctionBegin; 960db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 961db78de30SJunchao Zhang PetscValidPointer(kv,2); 962db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 9639566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 964db78de30SJunchao Zhang PetscFunctionReturn(0); 965db78de30SJunchao Zhang } 966db78de30SJunchao Zhang 96742550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 968db78de30SJunchao Zhang { 969db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 970db78de30SJunchao Zhang 971db78de30SJunchao Zhang PetscFunctionBegin; 972db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 973db78de30SJunchao Zhang PetscValidPointer(kv,2); 974db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 975db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 976076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 977db78de30SJunchao Zhang PetscFunctionReturn(0); 978db78de30SJunchao Zhang } 979db78de30SJunchao Zhang 98042550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 981db78de30SJunchao Zhang { 982db78de30SJunchao Zhang PetscFunctionBegin; 983db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 984db78de30SJunchao Zhang PetscValidPointer(kv,2); 985db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 9869566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 987db78de30SJunchao Zhang PetscFunctionReturn(0); 988db78de30SJunchao Zhang } 989db78de30SJunchao Zhang 990c17cf699SJunchao Zhang /* Computes Y += alpha X */ 991c17cf699SJunchao Zhang static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 992a587d139SMark { 993a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 994c17cf699SJunchao Zhang Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 995c17cf699SJunchao Zhang ConstMatScalarKokkosView Xa; 996c17cf699SJunchao Zhang MatScalarKokkosView Ya; 997a587d139SMark 998a587d139SMark PetscFunctionBegin; 999c17cf699SJunchao Zhang PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 1000c17cf699SJunchao Zhang PetscCheckTypeName(X,MATSEQAIJKOKKOS); 10019566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 10029566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(X)); 10039566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1004db78de30SJunchao Zhang 1005c17cf699SJunchao Zhang if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1006c17cf699SJunchao Zhang /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 1007a587d139SMark PetscBool e; 10089566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e)); 1009a587d139SMark if (e) { 10109566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(x->j,y->j,y->nz,&e)); 1011c17cf699SJunchao Zhang if (e) pattern = SAME_NONZERO_PATTERN; 1012a587d139SMark } 1013a587d139SMark } 1014db78de30SJunchao Zhang 1015c17cf699SJunchao Zhang /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1016c17cf699SJunchao Zhang cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1017c17cf699SJunchao Zhang If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1018c17cf699SJunchao Zhang KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1019c17cf699SJunchao Zhang */ 1020c17cf699SJunchao Zhang ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 1021c17cf699SJunchao Zhang xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 1022c17cf699SJunchao Zhang Xa = xkok->a_dual.view_device(); 1023c17cf699SJunchao Zhang Ya = ykok->a_dual.view_device(); 1024c17cf699SJunchao Zhang 1025c17cf699SJunchao Zhang if (pattern == SAME_NONZERO_PATTERN) { 1026c17cf699SJunchao Zhang KokkosBlas::axpy(alpha,Xa,Ya); 10279566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1028c17cf699SJunchao Zhang } else if (pattern == SUBSET_NONZERO_PATTERN) { 1029c17cf699SJunchao Zhang MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 1030c17cf699SJunchao Zhang MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 1031c17cf699SJunchao Zhang 1032c17cf699SJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 1033c17cf699SJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 1034c17cf699SJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 1035c17cf699SJunchao Zhang PetscInt p,q = Yi(i); 1036c17cf699SJunchao Zhang for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 1037c17cf699SJunchao Zhang while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 1038c17cf699SJunchao Zhang if (Xj(p) == Yj(q)) { /* Found it */ 1039c17cf699SJunchao Zhang Ya(q) += alpha * Xa(p); 1040c17cf699SJunchao Zhang q++; 1041a587d139SMark } else { 1042c17cf699SJunchao Zhang /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1043c17cf699SJunchao Zhang Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1044c17cf699SJunchao Zhang */ 10458b8b16f9SJunchao Zhang if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = 10468b8b16f9SJunchao Zhang #if PETSC_PKG_KOKKOS_VERSION_GE(3,6,99) 10478b8b16f9SJunchao Zhang Kokkos::nan("1"); /* auto promote the double NaN if needed */ 10488b8b16f9SJunchao Zhang #else 10498b8b16f9SJunchao Zhang Kokkos::Experimental::nan("1"); 10508b8b16f9SJunchao Zhang #endif 1051a587d139SMark } 1052c17cf699SJunchao Zhang } 1053c17cf699SJunchao Zhang }); 1054c17cf699SJunchao Zhang }); 10559566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1056c17cf699SJunchao Zhang } else { /* different nonzero patterns */ 1057c17cf699SJunchao Zhang Mat Z; 1058c17cf699SJunchao Zhang KokkosCsrMatrix zcsr; 1059c17cf699SJunchao Zhang KernelHandle kh; 1060c17cf699SJunchao Zhang kh.create_spadd_handle(false); 1061c17cf699SJunchao Zhang KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1062c17cf699SJunchao Zhang KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1063c17cf699SJunchao Zhang zkok = new Mat_SeqAIJKokkos(zcsr); 10649566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z)); 10659566063dSJacob Faibussowitsch PetscCall(MatHeaderReplace(Y,&Z)); 1066c17cf699SJunchao Zhang kh.destroy_spadd_handle(); 1067c17cf699SJunchao Zhang } 10689566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 10699566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0)*2)); /* Because we scaled X and then added it to Y */ 1070a587d139SMark PetscFunctionReturn(0); 1071a587d139SMark } 1072a587d139SMark 1073*e8729f6fSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 107442550becSJunchao Zhang { 107542550becSJunchao Zhang Mat_SeqAIJKokkos *akok; 107642550becSJunchao Zhang Mat_SeqAIJ *aseq; 107742550becSJunchao Zhang 107842550becSJunchao Zhang PetscFunctionBegin; 10799566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJ(mat,coo_n,coo_i,coo_j)); 1080394ed5ebSJunchao Zhang aseq = static_cast<Mat_SeqAIJ*>(mat->data); 108142550becSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 1082cbc6b225SStefano Zampini delete akok; 1083cbc6b225SStefano Zampini mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,mat->nonzerostate+1,PETSC_FALSE); 10849566063dSJacob Faibussowitsch PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1085394ed5ebSJunchao Zhang akok->SetUpCOO(aseq); 108642550becSJunchao Zhang PetscFunctionReturn(0); 108742550becSJunchao Zhang } 108842550becSJunchao Zhang 108942550becSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 109042550becSJunchao Zhang { 109142550becSJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 109242550becSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 1093394ed5ebSJunchao Zhang PetscCount Annz = aseq->nz; 1094394ed5ebSJunchao Zhang const PetscCountKokkosView& jmap = akok->jmap_d; 1095394ed5ebSJunchao Zhang const PetscCountKokkosView& perm = akok->perm_d; 109642550becSJunchao Zhang MatScalarKokkosView Aa; 109742550becSJunchao Zhang ConstMatScalarKokkosView kv; 109842550becSJunchao Zhang PetscMemType memtype; 109942550becSJunchao Zhang 110042550becSJunchao Zhang PetscFunctionBegin; 11019566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(v,&memtype)); 110242550becSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1103394ed5ebSJunchao Zhang kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,aseq->coo_n)); 110442550becSJunchao Zhang } else { 1105394ed5ebSJunchao Zhang kv = ConstMatScalarKokkosView(v,aseq->coo_n); /* Directly use v[]'s memory */ 110642550becSJunchao Zhang } 110742550becSJunchao Zhang 1108c7b718f4SJunchao Zhang if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); /* write matrix values */ 1109c7b718f4SJunchao Zhang else PetscCall(MatSeqAIJGetKokkosView(A,&Aa)); /* read & write matrix values */ 111042550becSJunchao Zhang 1111c7b718f4SJunchao Zhang Kokkos::parallel_for(Annz,KOKKOS_LAMBDA(const PetscCount i) { 1112c7b718f4SJunchao Zhang PetscScalar sum = 0.0; 1113c7b718f4SJunchao Zhang for (PetscCount k=jmap(i); k<jmap(i+1); k++) sum += kv(perm(k)); 1114c7b718f4SJunchao Zhang Aa(i) = (imode == INSERT_VALUES? 0.0 : Aa(i)) + sum; 1115c7b718f4SJunchao Zhang }); 1116394ed5ebSJunchao Zhang 11179566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa)); 11189566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJRestoreKokkosView(A,&Aa)); 111942550becSJunchao Zhang PetscFunctionReturn(0); 112042550becSJunchao Zhang } 112142550becSJunchao Zhang 11225fbaff96SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A,const PetscInt *diag) 11235fbaff96SJunchao Zhang { 11245fbaff96SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 11255fbaff96SJunchao Zhang MatScalarKokkosView Aa; 11265fbaff96SJunchao Zhang const MatRowMapKokkosView& Ai = akok->i_dual.view_device(); 11275fbaff96SJunchao Zhang PetscInt m = A->rmap->n; 11285fbaff96SJunchao Zhang ConstMatRowMapKokkosView Adiag(diag,m); /* diag is a device pointer */ 11295fbaff96SJunchao Zhang 11305fbaff96SJunchao Zhang PetscFunctionBegin; 11315fbaff96SJunchao Zhang PetscCall(MatSeqAIJGetKokkosViewWrite(A,&Aa)); 11325fbaff96SJunchao Zhang Kokkos::parallel_for(m,KOKKOS_LAMBDA(const PetscInt i) { 11335fbaff96SJunchao Zhang PetscScalar tmp; 11345fbaff96SJunchao Zhang if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i+1)) { /* The diagonal element exists */ 11355fbaff96SJunchao Zhang tmp = Aa(Ai(i)); 11365fbaff96SJunchao Zhang Aa(Ai(i)) = Aa(Adiag(i)); 11375fbaff96SJunchao Zhang Aa(Adiag(i)) = tmp; 11385fbaff96SJunchao Zhang } 11395fbaff96SJunchao Zhang }); 11405fbaff96SJunchao Zhang PetscCall(MatSeqAIJRestoreKokkosViewWrite(A,&Aa)); 11415fbaff96SJunchao Zhang PetscFunctionReturn(0); 11425fbaff96SJunchao Zhang } 11435fbaff96SJunchao Zhang 114486a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 11458f7e8f9dSMark Adams { 11468f7e8f9dSMark Adams PetscFunctionBegin; 11479566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncHost(A)); 11489566063dSJacob Faibussowitsch PetscCall(MatLUFactorNumeric_SeqAIJ(B,A,info)); 11498f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 11508f7e8f9dSMark Adams PetscFunctionReturn(0); 11518f7e8f9dSMark Adams } 11528f7e8f9dSMark Adams 11538c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 11548c3ff71bSJunchao Zhang { 1155076ba34aSJunchao Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1156076ba34aSJunchao Zhang 11578c3ff71bSJunchao Zhang PetscFunctionBegin; 1158076ba34aSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 11596f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 11606f3d89d0SStefano Zampini 11618c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 11628c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 11638c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1164a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 1165f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 1166a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1167076ba34aSJunchao Zhang A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 11688c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 11698c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 11708c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 11718c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 11728c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 11738c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1174076ba34aSJunchao Zhang A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 11750ecb592aSJunchao Zhang A->ops->transpose = MatTranspose_SeqAIJKokkos; 1176152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 1177f78ce678SMark Adams A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1178076ba34aSJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1179076ba34aSJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1180076ba34aSJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1181076ba34aSJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1182076ba34aSJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1183076ba34aSJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 11847ee59b9bSJunchao Zhang a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 118542550becSJunchao Zhang 11869566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos)); 11879566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_SeqAIJKokkos)); 1188076ba34aSJunchao Zhang PetscFunctionReturn(0); 1189076ba34aSJunchao Zhang } 1190076ba34aSJunchao Zhang 1191076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1192076ba34aSJunchao Zhang { 1193076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 1194076ba34aSJunchao Zhang PetscInt i,m,n; 1195076ba34aSJunchao Zhang 1196076ba34aSJunchao Zhang PetscFunctionBegin; 11975f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr,PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1198076ba34aSJunchao Zhang 1199076ba34aSJunchao Zhang m = akok->nrows(); 1200076ba34aSJunchao Zhang n = akok->ncols(); 12019566063dSJacob Faibussowitsch PetscCall(MatSetSizes(A,m,n,m,n)); 12029566063dSJacob Faibussowitsch PetscCall(MatSetType(A,MATSEQAIJKOKKOS)); 1203076ba34aSJunchao Zhang 1204076ba34aSJunchao Zhang /* Set up data structures of A as a MATSEQAIJ */ 12059566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL)); 1206076ba34aSJunchao Zhang aseq = (Mat_SeqAIJ*)(A)->data; 1207076ba34aSJunchao Zhang 1208076ba34aSJunchao Zhang akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1209076ba34aSJunchao Zhang akok->j_dual.sync_host(); 1210076ba34aSJunchao Zhang 1211076ba34aSJunchao Zhang aseq->i = akok->i_host_data(); 1212076ba34aSJunchao Zhang aseq->j = akok->j_host_data(); 1213076ba34aSJunchao Zhang aseq->a = akok->a_host_data(); 1214076ba34aSJunchao Zhang aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1215076ba34aSJunchao Zhang aseq->singlemalloc = PETSC_FALSE; 1216076ba34aSJunchao Zhang aseq->free_a = PETSC_FALSE; 1217076ba34aSJunchao Zhang aseq->free_ij = PETSC_FALSE; 1218076ba34aSJunchao Zhang aseq->nz = akok->nnz(); 1219076ba34aSJunchao Zhang aseq->maxnz = aseq->nz; 1220076ba34aSJunchao Zhang 12219566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&aseq->imax)); 12229566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m,&aseq->ilen)); 1223076ba34aSJunchao Zhang for (i=0; i<m; i++) { 1224076ba34aSJunchao Zhang aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1225076ba34aSJunchao Zhang } 1226076ba34aSJunchao Zhang 1227076ba34aSJunchao Zhang /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1228076ba34aSJunchao Zhang akok->nonzerostate = A->nonzerostate; 1229ff751488SJunchao Zhang A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 12309566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY)); 12319566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY)); 1232076ba34aSJunchao Zhang PetscFunctionReturn(0); 1233076ba34aSJunchao Zhang } 1234076ba34aSJunchao Zhang 1235076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1236076ba34aSJunchao Zhang 1237076ba34aSJunchao Zhang Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1238076ba34aSJunchao Zhang */ 1239076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1240076ba34aSJunchao Zhang { 1241076ba34aSJunchao Zhang PetscFunctionBegin; 12429566063dSJacob Faibussowitsch PetscCall(MatCreate(comm,A)); 12439566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A,akok)); 12448c3ff71bSJunchao Zhang PetscFunctionReturn(0); 12458c3ff71bSJunchao Zhang } 12468c3ff71bSJunchao Zhang 12478c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/ 1248152b3e56SJunchao Zhang /*@C 12498c3ff71bSJunchao Zhang MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 12508c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 12518c3ff71bSJunchao Zhang Kokkos for calculations. For good matrix 12528c3ff71bSJunchao Zhang assembly performance the user should preallocate the matrix storage by setting 12538c3ff71bSJunchao Zhang the parameter nz (or the array nnz). By setting these parameters accurately, 12548c3ff71bSJunchao Zhang performance during matrix assembly can be increased by more than a factor of 50. 12558c3ff71bSJunchao Zhang 12568c3ff71bSJunchao Zhang Collective 12578c3ff71bSJunchao Zhang 12588c3ff71bSJunchao Zhang Input Parameters: 12598c3ff71bSJunchao Zhang + comm - MPI communicator, set to PETSC_COMM_SELF 12608c3ff71bSJunchao Zhang . m - number of rows 12618c3ff71bSJunchao Zhang . n - number of columns 12628c3ff71bSJunchao Zhang . nz - number of nonzeros per row (same for all rows) 12638c3ff71bSJunchao Zhang - nnz - array containing the number of nonzeros in the various rows 12648c3ff71bSJunchao Zhang (possibly different for each row) or NULL 12658c3ff71bSJunchao Zhang 12668c3ff71bSJunchao Zhang Output Parameter: 12678c3ff71bSJunchao Zhang . A - the matrix 12688c3ff71bSJunchao Zhang 12698c3ff71bSJunchao Zhang It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 12708c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 12718c3ff71bSJunchao Zhang [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 12728c3ff71bSJunchao Zhang 12738c3ff71bSJunchao Zhang Notes: 12748c3ff71bSJunchao Zhang If nnz is given then nz is ignored 12758c3ff71bSJunchao Zhang 12768c3ff71bSJunchao Zhang The AIJ format (also called the Yale sparse matrix format or 12778c3ff71bSJunchao Zhang compressed row storage), is fully compatible with standard Fortran 77 12788c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 12798c3ff71bSJunchao Zhang either one (as in Fortran) or zero. See the users' manual for details. 12808c3ff71bSJunchao Zhang 12818c3ff71bSJunchao Zhang Specify the preallocated storage with either nz or nnz (not both). 12828c3ff71bSJunchao Zhang Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 12838c3ff71bSJunchao Zhang allocation. For large problems you MUST preallocate memory or you 12848c3ff71bSJunchao Zhang will get TERRIBLE performance, see the users' manual chapter on matrices. 12858c3ff71bSJunchao Zhang 12868c3ff71bSJunchao Zhang By default, this format uses inodes (identical nodes) when possible, to 12878c3ff71bSJunchao Zhang improve numerical efficiency of matrix-vector products and solves. We 12888c3ff71bSJunchao Zhang search for consecutive rows with the same nonzero structure, thereby 12898c3ff71bSJunchao Zhang reusing matrix information to achieve increased efficiency. 12908c3ff71bSJunchao Zhang 12918c3ff71bSJunchao Zhang Level: intermediate 12928c3ff71bSJunchao Zhang 1293db781477SPatrick Sanan .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()` 12948c3ff71bSJunchao Zhang @*/ 12958c3ff71bSJunchao Zhang PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 12968c3ff71bSJunchao Zhang { 12978c3ff71bSJunchao Zhang PetscFunctionBegin; 12989566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 12999566063dSJacob Faibussowitsch PetscCall(MatCreate(comm,A)); 13009566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A,m,n,m,n)); 13019566063dSJacob Faibussowitsch PetscCall(MatSetType(*A,MATSEQAIJKOKKOS)); 13029566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz)); 13038c3ff71bSJunchao Zhang PetscFunctionReturn(0); 13048c3ff71bSJunchao Zhang } 1305930e68a5SMark Adams 13068f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member; 13078f7e8f9dSMark Adams // 130846804e07SMark Adams // This factorization exploits block diagonal matrices with "Nf" (not used). 13098f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 13108f7e8f9dSMark Adams // 13118f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1312930e68a5SMark Adams { 13138f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 13148f7e8f9dSMark Adams Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 13158f7e8f9dSMark Adams IS isrow = b->row,isicol = b->icol; 13168f7e8f9dSMark Adams const PetscInt *r_h,*ic_h; 1317300d22a6SJunchao Zhang const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1318076ba34aSJunchao Zhang const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1319076ba34aSJunchao Zhang PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 13208f7e8f9dSMark Adams PetscBool row_identity,col_identity; 132146804e07SMark Adams PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1322930e68a5SMark Adams 1323930e68a5SMark Adams PetscFunctionBegin; 13242c71b3e2SJacob Faibussowitsch PetscCheck(A->rmap->n == n,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 1325b94d7dedSBarry Smith PetscCall(MatIsStructurallySymmetric(A,&row_identity)); 13262c71b3e2SJacob Faibussowitsch PetscCheck(row_identity,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 13279566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow,&r_h)); 13289566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isicol,&ic_h)); 13299566063dSJacob Faibussowitsch PetscCall(ISGetSize(isicol,&nc)); 13309566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 13319566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 13328f7e8f9dSMark Adams { 13338f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1 13348f7e8f9dSMark Adams using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 13358f7e8f9dSMark Adams using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 13368f7e8f9dSMark Adams using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 13378f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 13388f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 13398f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 13408f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 13418f7e8f9dSMark Adams size_t flops_h = 0.0; 13428f7e8f9dSMark Adams Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 13438f7e8f9dSMark Adams Kokkos::View<size_t> d_flops_k ("flops"); 13448f7e8f9dSMark Adams const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 13458f7e8f9dSMark Adams const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 13468f7e8f9dSMark Adams Kokkos::deep_copy (d_flops_k, h_flops_k); 13478f7e8f9dSMark Adams Kokkos::deep_copy (d_r_k, h_r_k); 13488f7e8f9dSMark Adams Kokkos::deep_copy (d_ic_k, h_ic_k); 13498f7e8f9dSMark Adams // Fill A --> fact 13508f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1351042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 13528f7e8f9dSMark Adams const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 13538f7e8f9dSMark Adams const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 13548f7e8f9dSMark Adams // zero rows of B 13558f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 13568f7e8f9dSMark Adams PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 13578f7e8f9dSMark Adams PetscScalar *baL = ba_d + bi_d[rowb]; 13588f7e8f9dSMark Adams PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 13598f7e8f9dSMark Adams /* zero (unfactored row) */ 13608f7e8f9dSMark Adams for (int j=0;j<nzbL;j++) baL[j] = 0; 13618f7e8f9dSMark Adams for (int j=0;j<nzbU;j++) baU[j] = 0; 13628f7e8f9dSMark Adams }); 13638f7e8f9dSMark Adams // copy A into B 13648f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 13658f7e8f9dSMark Adams PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 13668f7e8f9dSMark Adams const PetscScalar *av = aa_d + ai_d[rowa]; 13678f7e8f9dSMark Adams const PetscInt *ajtmp = aj_d + ai_d[rowa]; 13688f7e8f9dSMark Adams /* load in initial (unfactored row) */ 13698f7e8f9dSMark Adams for (int j=0;j<nza;j++) { 13708f7e8f9dSMark Adams PetscInt colb = ic[ajtmp[j]]; 13718f7e8f9dSMark Adams PetscScalar vala = av[j]; 13728f7e8f9dSMark Adams if (colb == rowb) { 13738f7e8f9dSMark Adams *(ba_d + bdiag_d[rowb]) = vala; 13748f7e8f9dSMark Adams } else { 13758f7e8f9dSMark Adams const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 13768f7e8f9dSMark Adams PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 13778f7e8f9dSMark Adams PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 13788f7e8f9dSMark Adams for (int j=0; j<nz ; j++) { 13798f7e8f9dSMark Adams if (pbj[j] == colb) { 13808f7e8f9dSMark Adams pba[j] = vala; 13818f7e8f9dSMark Adams set++; 13828f7e8f9dSMark Adams break; 13838f7e8f9dSMark Adams } 13848f7e8f9dSMark Adams } 13858f1da0b2SJunchao Zhang #if !defined(PETSC_HAVE_SYCL) 13868f7e8f9dSMark Adams if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 13878f1da0b2SJunchao Zhang #endif 13888f7e8f9dSMark Adams } 13898f7e8f9dSMark Adams } 13908f7e8f9dSMark Adams }); 13918f7e8f9dSMark Adams }); 13928f7e8f9dSMark Adams Kokkos::fence(); 1393930e68a5SMark Adams 13948f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 13958f7e8f9dSMark Adams sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 13968f7e8f9dSMark Adams scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 13978f7e8f9dSMark Adams sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1398042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 13998f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc; 14008f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 14018f7e8f9dSMark Adams // A22 panel update for each row A(1,:) and col A(:,1) 14028f7e8f9dSMark Adams for (int ii=start; ii<end-1; ii++) { 14038f7e8f9dSMark Adams const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 14048f7e8f9dSMark Adams const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 14058f7e8f9dSMark Adams const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 14068f7e8f9dSMark Adams const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 14078f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 14088f7e8f9dSMark Adams PetscInt kIdx = j*Ni + field_block_idx; 14098f7e8f9dSMark Adams if (kIdx >= nzUi) /* void */ ; 14108f7e8f9dSMark Adams else { 14118f7e8f9dSMark Adams const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 14128f7e8f9dSMark Adams const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 14138f7e8f9dSMark Adams const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 14148f7e8f9dSMark Adams size_t st_idx; 14158f7e8f9dSMark Adams // find and do L(k,i) = A(:k,i) / A(i,i) 14168f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 14178f7e8f9dSMark Adams // get column, there has got to be a better way 14188f7e8f9dSMark Adams Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 14198f7e8f9dSMark Adams if (pjL[j] == ii) { 14208f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + j; 14218f7e8f9dSMark Adams idx = j; // output 14228f7e8f9dSMark Adams *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 14238f7e8f9dSMark Adams } 14248f7e8f9dSMark Adams }, st_idx); 14258f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 14268f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 142799551766SMark Adams if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 142899551766SMark Adams #endif 142999551766SMark Adams // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 14308f7e8f9dSMark Adams // U(i+1,:end) 14318f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 14328f7e8f9dSMark Adams PetscScalar Uij = baUi[uiIdx]; 14338f7e8f9dSMark Adams PetscInt col = bjUi[uiIdx]; 14348f7e8f9dSMark Adams if (col==myk) { 14358f7e8f9dSMark Adams // A_kk = A_kk - L_ki * U_ij(k) 14368f7e8f9dSMark Adams PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 14378f7e8f9dSMark Adams *Akkv = *Akkv - L_ki() * Uij; // UiK 14388f7e8f9dSMark Adams } else { 14398f7e8f9dSMark Adams PetscScalar *start, *end, *pAkjv=NULL; 14408f7e8f9dSMark Adams PetscInt high, low; 14418f7e8f9dSMark Adams const PetscInt *startj; 14428f7e8f9dSMark Adams if (col<myk) { // L 14438f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 14448f7e8f9dSMark Adams PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 14458f7e8f9dSMark Adams start = pLki+1; // start at pLki+1, A22(myk,1) 14468f7e8f9dSMark Adams startj= bj_d + bi_d[myk] + idx; 14478f7e8f9dSMark Adams end = ba_d + bi_d[myk+1]; 14488f7e8f9dSMark Adams } else { 14498f7e8f9dSMark Adams PetscInt idx = bdiag_d[myk+1]+1; 14508f7e8f9dSMark Adams start = ba_d + idx; 14518f7e8f9dSMark Adams startj= bj_d + idx; 14528f7e8f9dSMark Adams end = ba_d + bdiag_d[myk]; 14538f7e8f9dSMark Adams } 14548f7e8f9dSMark Adams // search for 'col', use bisection search - TODO 14558f7e8f9dSMark Adams low = 0; 14568f7e8f9dSMark Adams high = (PetscInt)(end-start); 14578f7e8f9dSMark Adams while (high-low > 5) { 14588f7e8f9dSMark Adams int t = (low+high)/2; 14598f7e8f9dSMark Adams if (startj[t] > col) high = t; 14608f7e8f9dSMark Adams else low = t; 14618f7e8f9dSMark Adams } 14628f7e8f9dSMark Adams for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 14638f7e8f9dSMark Adams if (startj[pAkjv-start] == col) break; 14648f7e8f9dSMark Adams } 14658f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 146699551766SMark Adams if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 146799551766SMark Adams #endif 14688f7e8f9dSMark Adams *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 14698f7e8f9dSMark Adams } 14708f7e8f9dSMark Adams }); 14718f7e8f9dSMark Adams } 14728f7e8f9dSMark Adams }); 14738f7e8f9dSMark Adams team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 14748f7e8f9dSMark Adams if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 14758f7e8f9dSMark Adams } /* endof for (i=0; i<n; i++) { */ 14768f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 14778f7e8f9dSMark Adams }); 14788f7e8f9dSMark Adams Kokkos::fence(); 14798f7e8f9dSMark Adams Kokkos::deep_copy (h_flops_k, d_flops_k); 14809566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k())); 14818f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 14828f7e8f9dSMark Adams const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 14838f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 14848f7e8f9dSMark Adams /* Invert diagonal for simpler triangular solves */ 14858f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 14868f7e8f9dSMark Adams int i = start + outer_index*Ni + lg_rank%Ni; 14878f7e8f9dSMark Adams if (i < end) { 14888f7e8f9dSMark Adams PetscScalar *pv = ba_d + bdiag_d[i]; 14898f7e8f9dSMark Adams *pv = 1.0/(*pv); 14908f7e8f9dSMark Adams } 14918f7e8f9dSMark Adams }); 14928f7e8f9dSMark Adams }); 14938f7e8f9dSMark Adams } 14949566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 14959566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isicol,&ic_h)); 14969566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow,&r_h)); 14978f7e8f9dSMark Adams 14989566063dSJacob Faibussowitsch PetscCall(ISIdentity(isrow,&row_identity)); 14999566063dSJacob Faibussowitsch PetscCall(ISIdentity(isicol,&col_identity)); 15008f7e8f9dSMark Adams if (b->inode.size) { 15018f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_Inode; 15028f7e8f9dSMark Adams } else if (row_identity && col_identity) { 15038f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 15048f7e8f9dSMark Adams } else { 15058f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 15068f7e8f9dSMark Adams } 15078f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_GPU; 15089566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU 15098f7e8f9dSMark Adams B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 15108f7e8f9dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 15118f7e8f9dSMark Adams B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 15128f7e8f9dSMark Adams B->ops->matsolve = MatMatSolve_SeqAIJ; 15138f7e8f9dSMark Adams B->assembled = PETSC_TRUE; 15148f7e8f9dSMark Adams B->preallocated = PETSC_TRUE; 15158f7e8f9dSMark Adams 1516930e68a5SMark Adams PetscFunctionReturn(0); 1517930e68a5SMark Adams } 1518930e68a5SMark Adams 151986a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1520930e68a5SMark Adams { 1521930e68a5SMark Adams PetscFunctionBegin; 15229566063dSJacob Faibussowitsch PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 152386a27549SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 152486a27549SJunchao Zhang PetscFunctionReturn(0); 152586a27549SJunchao Zhang } 152686a27549SJunchao Zhang 152786a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 152886a27549SJunchao Zhang { 152986a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 153086a27549SJunchao Zhang 153186a27549SJunchao Zhang PetscFunctionBegin; 153286a27549SJunchao Zhang if (!factors->sptrsv_symbolic_completed) { 153386a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 153486a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 153586a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_TRUE; 153686a27549SJunchao Zhang } 153786a27549SJunchao Zhang PetscFunctionReturn(0); 153886a27549SJunchao Zhang } 153986a27549SJunchao Zhang 154086a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */ 154186a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 154286a27549SJunchao Zhang { 154386a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1544076ba34aSJunchao Zhang MatColIdxType n = A->rmap->n; 154586a27549SJunchao Zhang 154686a27549SJunchao Zhang PetscFunctionBegin; 154786a27549SJunchao Zhang if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 154886a27549SJunchao Zhang /* Update L^T and do sptrsv symbolic */ 1549076ba34aSJunchao Zhang factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 155086a27549SJunchao Zhang Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1551076ba34aSJunchao Zhang factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1552076ba34aSJunchao Zhang factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 155386a27549SJunchao Zhang 155486a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1555076ba34aSJunchao Zhang ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1556076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1557076ba34aSJunchao Zhang MatRowMapKokkosView,DefaultExecutionSpace>( 155886a27549SJunchao Zhang n,n,factors->iL_d,factors->jL_d,factors->aL_d, 155986a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 156086a27549SJunchao Zhang 156186a27549SJunchao Zhang /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 156286a27549SJunchao Zhang We have to sort the indices, until KK provides finer control options. 156386a27549SJunchao Zhang */ 1564076ba34aSJunchao Zhang KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1565076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 156686a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 156786a27549SJunchao Zhang 156886a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 156986a27549SJunchao Zhang 157086a27549SJunchao Zhang /* Update U^T and do sptrsv symbolic */ 1571076ba34aSJunchao Zhang factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 157286a27549SJunchao Zhang Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1573076ba34aSJunchao Zhang factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1574076ba34aSJunchao Zhang factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 157586a27549SJunchao Zhang 157686a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1577076ba34aSJunchao Zhang ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1578076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1579076ba34aSJunchao Zhang MatRowMapKokkosView,DefaultExecutionSpace>( 158086a27549SJunchao Zhang n,n,factors->iU_d, factors->jU_d, factors->aU_d, 158186a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 158286a27549SJunchao Zhang 158386a27549SJunchao Zhang /* Sort indices. See comments above */ 1584076ba34aSJunchao Zhang KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1585076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 158686a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 158786a27549SJunchao Zhang 158886a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 158986a27549SJunchao Zhang factors->transpose_updated = PETSC_TRUE; 159086a27549SJunchao Zhang } 159186a27549SJunchao Zhang PetscFunctionReturn(0); 159286a27549SJunchao Zhang } 159386a27549SJunchao Zhang 159486a27549SJunchao Zhang /* Solve Ax = b, with A = LU */ 159586a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 159686a27549SJunchao Zhang { 159786a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 159886a27549SJunchao Zhang PetscScalarKokkosView xv; 159986a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 160086a27549SJunchao Zhang 160186a27549SJunchao Zhang PetscFunctionBegin; 16029566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 16039566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 16049566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(b,&bv)); 16059566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(x,&xv)); 160686a27549SJunchao Zhang /* Solve L tmpv = b */ 16079566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 160886a27549SJunchao Zhang /* Solve Ux = tmpv */ 16099566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 16109566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(b,&bv)); 16119566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 16129566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 161386a27549SJunchao Zhang PetscFunctionReturn(0); 161486a27549SJunchao Zhang } 161586a27549SJunchao Zhang 1616076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */ 161786a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 161886a27549SJunchao Zhang { 161986a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 162086a27549SJunchao Zhang PetscScalarKokkosView xv; 162186a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 162286a27549SJunchao Zhang 162386a27549SJunchao Zhang PetscFunctionBegin; 16249566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 16259566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 16269566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(b,&bv)); 16279566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(x,&xv)); 162886a27549SJunchao Zhang /* Solve U^T tmpv = b */ 162986a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 163086a27549SJunchao Zhang 163186a27549SJunchao Zhang /* Solve L^T x = tmpv */ 163286a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 16339566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(b,&bv)); 16349566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(x,&xv)); 16359566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 163686a27549SJunchao Zhang PetscFunctionReturn(0); 163786a27549SJunchao Zhang } 163886a27549SJunchao Zhang 163986a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 164086a27549SJunchao Zhang { 164186a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 164286a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 164386a27549SJunchao Zhang PetscInt fill_lev = info->levels; 164486a27549SJunchao Zhang 164586a27549SJunchao Zhang PetscFunctionBegin; 16469566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 16479566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1648076ba34aSJunchao Zhang 1649076ba34aSJunchao Zhang auto a_d = aijkok->a_dual.view_device(); 1650076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1651076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1652076ba34aSJunchao Zhang 1653076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 165486a27549SJunchao Zhang 165586a27549SJunchao Zhang B->assembled = PETSC_TRUE; 165686a27549SJunchao Zhang B->preallocated = PETSC_TRUE; 165786a27549SJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos; 165886a27549SJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 165986a27549SJunchao Zhang B->ops->matsolve = NULL; 166086a27549SJunchao Zhang B->ops->matsolvetranspose = NULL; 166186a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 166286a27549SJunchao Zhang 166386a27549SJunchao Zhang /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 166486a27549SJunchao Zhang factors->transpose_updated = PETSC_FALSE; 166586a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 1666eeadb341SJunchao Zhang /* TODO: log flops, but how to know that? */ 16679566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 166886a27549SJunchao Zhang PetscFunctionReturn(0); 166986a27549SJunchao Zhang } 167086a27549SJunchao Zhang 167186a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 167286a27549SJunchao Zhang { 167386a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 167486a27549SJunchao Zhang Mat_SeqAIJ *b; 167586a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 167686a27549SJunchao Zhang PetscInt fill_lev = info->levels; 167786a27549SJunchao Zhang PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 167886a27549SJunchao Zhang PetscInt n = A->rmap->n; 167986a27549SJunchao Zhang 168086a27549SJunchao Zhang PetscFunctionBegin; 16819566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 168286a27549SJunchao Zhang /* Rebuild factors */ 168386a27549SJunchao Zhang if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 168486a27549SJunchao Zhang else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 168586a27549SJunchao Zhang 168686a27549SJunchao Zhang /* Create a spiluk handle and then do symbolic factorization */ 168786a27549SJunchao Zhang nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 168886a27549SJunchao Zhang factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 168986a27549SJunchao Zhang 169086a27549SJunchao Zhang auto spiluk_handle = factors->kh.get_spiluk_handle(); 169186a27549SJunchao Zhang 169286a27549SJunchao Zhang Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 169386a27549SJunchao Zhang Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 169486a27549SJunchao Zhang Kokkos::realloc(factors->iU_d,n+1); 169586a27549SJunchao Zhang Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 169686a27549SJunchao Zhang 169786a27549SJunchao Zhang aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1698076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1699076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1700076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 170186a27549SJunchao Zhang /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 170286a27549SJunchao Zhang 170386a27549SJunchao Zhang Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 170486a27549SJunchao Zhang Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 170586a27549SJunchao Zhang Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 170686a27549SJunchao Zhang Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 170786a27549SJunchao Zhang 170886a27549SJunchao Zhang /* TODO: add options to select sptrsv algorithms */ 170986a27549SJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 171086a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 171186a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 171286a27549SJunchao Zhang #else 171386a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 171486a27549SJunchao Zhang #endif 171586a27549SJunchao Zhang 171686a27549SJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 171786a27549SJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 171886a27549SJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 171986a27549SJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 172086a27549SJunchao Zhang 172186a27549SJunchao Zhang /* Fill fields of the factor matrix B */ 17229566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL)); 172386a27549SJunchao Zhang b = (Mat_SeqAIJ*)B->data; 172486a27549SJunchao Zhang b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 172586a27549SJunchao Zhang B->info.fill_ratio_given = info->fill; 172686a27549SJunchao Zhang B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 172786a27549SJunchao Zhang 172886a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 172986a27549SJunchao Zhang B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1730930e68a5SMark Adams PetscFunctionReturn(0); 1731930e68a5SMark Adams } 1732930e68a5SMark Adams 17338f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 17348f7e8f9dSMark Adams { 17358f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 17368f7e8f9dSMark Adams const PetscInt nrows = A->rmap->n; 1737930e68a5SMark Adams 17388f7e8f9dSMark Adams PetscFunctionBegin; 17399566063dSJacob Faibussowitsch PetscCall(MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info)); 17408f7e8f9dSMark Adams B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 17418f7e8f9dSMark Adams // move B data into Kokkos 17429566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok 17439566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok 17448f7e8f9dSMark Adams { 17458f7e8f9dSMark Adams Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 1746300d22a6SJunchao Zhang if (!baijkok->diag_d.extent(0)) { 17478f7e8f9dSMark Adams const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1748300d22a6SJunchao Zhang baijkok->diag_d = Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 1749300d22a6SJunchao Zhang Kokkos::deep_copy (baijkok->diag_d, h_diag); 17508f7e8f9dSMark Adams } 17518f7e8f9dSMark Adams } 17528f7e8f9dSMark Adams PetscFunctionReturn(0); 17538f7e8f9dSMark Adams } 17548f7e8f9dSMark Adams 175586a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1756930e68a5SMark Adams { 1757930e68a5SMark Adams PetscFunctionBegin; 1758930e68a5SMark Adams *type = MATSOLVERKOKKOS; 1759930e68a5SMark Adams PetscFunctionReturn(0); 1760930e68a5SMark Adams } 1761930e68a5SMark Adams 17628f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 17638f7e8f9dSMark Adams { 17648f7e8f9dSMark Adams PetscFunctionBegin; 17658f7e8f9dSMark Adams *type = MATSOLVERKOKKOSDEVICE; 17668f7e8f9dSMark Adams PetscFunctionReturn(0); 17678f7e8f9dSMark Adams } 17688f7e8f9dSMark Adams 1769930e68a5SMark Adams /*MC 177086a27549SJunchao Zhang MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 177186a27549SJunchao Zhang on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1772930e68a5SMark Adams 1773930e68a5SMark Adams Level: beginner 1774930e68a5SMark Adams 1775db781477SPatrick Sanan .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1776930e68a5SMark Adams M*/ 177786a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1778930e68a5SMark Adams { 1779930e68a5SMark Adams PetscInt n = A->rmap->n; 1780930e68a5SMark Adams 1781930e68a5SMark Adams PetscFunctionBegin; 17829566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 17839566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*B,n,n,n,n)); 1784930e68a5SMark Adams (*B)->factortype = ftype; 17859566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 17869566063dSJacob Faibussowitsch PetscCall(MatSetType(*B,MATSEQAIJKOKKOS)); 1787930e68a5SMark Adams 17888f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 17899566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 179086a27549SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 179186a27549SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 179286a27549SJunchao Zhang } else if (ftype == MAT_FACTOR_ILU) { 17939566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 179486a27549SJunchao Zhang (*B)->canuseordering = PETSC_FALSE; 179586a27549SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 179698921bdaSJacob Faibussowitsch } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1797930e68a5SMark Adams 17989566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 17999566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos)); 1800930e68a5SMark Adams PetscFunctionReturn(0); 1801930e68a5SMark Adams } 18028f7e8f9dSMark Adams 18038f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 18048f7e8f9dSMark Adams { 18058f7e8f9dSMark Adams PetscInt n = A->rmap->n; 18068f7e8f9dSMark Adams 18078f7e8f9dSMark Adams PetscFunctionBegin; 18089566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A),B)); 18099566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*B,n,n,n,n)); 18108f7e8f9dSMark Adams (*B)->factortype = ftype; 1811f73b0415SBarry Smith (*B)->canuseordering = PETSC_TRUE; 18129566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU])); 18139566063dSJacob Faibussowitsch PetscCall(MatSetType(*B,MATSEQAIJKOKKOS)); 18148f7e8f9dSMark Adams 18158f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 18169566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B,A,A)); 18178f7e8f9dSMark Adams (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 18188f7e8f9dSMark Adams } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 18198f7e8f9dSMark Adams 18209566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL)); 18219566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device)); 18228f7e8f9dSMark Adams PetscFunctionReturn(0); 18238f7e8f9dSMark Adams } 182486a27549SJunchao Zhang 182586a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 182686a27549SJunchao Zhang { 182786a27549SJunchao Zhang PetscFunctionBegin; 18289566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos)); 18299566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos)); 18309566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device)); 183186a27549SJunchao Zhang PetscFunctionReturn(0); 183286a27549SJunchao Zhang } 183386a27549SJunchao Zhang 1834076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */ 1835076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1836076ba34aSJunchao Zhang { 1837076ba34aSJunchao Zhang const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1838076ba34aSJunchao Zhang const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1839076ba34aSJunchao Zhang const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1840076ba34aSJunchao Zhang const PetscInt *i = iv.data(); 1841076ba34aSJunchao Zhang const PetscInt *j = jv.data(); 1842076ba34aSJunchao Zhang const PetscScalar *a = av.data(); 1843076ba34aSJunchao Zhang PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1844076ba34aSJunchao Zhang 1845076ba34aSJunchao Zhang PetscFunctionBegin; 18469566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz)); 1847076ba34aSJunchao Zhang for (PetscInt k=0; k<m; k++) { 18489566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k)); 1849076ba34aSJunchao Zhang for (PetscInt p=i[k]; p<i[k+1]; p++) { 185063a3b9bcSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],(double)PetscRealPart(a[p]))); 1851076ba34aSJunchao Zhang } 18529566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF,"\n")); 1853076ba34aSJunchao Zhang } 1854076ba34aSJunchao Zhang PetscFunctionReturn(0); 1855076ba34aSJunchao Zhang } 1856