111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 2076ba34aSJunchao Zhang #include <petscpkg_version.h> 3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 442550becSJunchao Zhang #include <petsc/private/sfimpl.h> 58c3ff71bSJunchao Zhang #include <petscsystypes.h> 68c3ff71bSJunchao Zhang #include <petscerror.h> 78c3ff71bSJunchao Zhang 88c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 9f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 108c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 118c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 1286a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp> 1386a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp> 14076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp> 15076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp> 1686a27549SJunchao Zhang 1742550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 188c3ff71bSJunchao Zhang 19f98996d3SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 6, 99) 20f98996d3SJunchao Zhang #include <KokkosSparse_Utils.hpp> 21f98996d3SJunchao Zhang using KokkosSparse::sort_crs_matrix; 22*9371c9d4SSatish Balay using KokkosSparse::Impl::transpose_matrix; 23f98996d3SJunchao Zhang #else 24f98996d3SJunchao Zhang #include <KokkosKernels_Sorting.hpp> 25f98996d3SJunchao Zhang using KokkosKernels::sort_crs_matrix; 26*9371c9d4SSatish Balay using KokkosKernels::Impl::transpose_matrix; 27f98996d3SJunchao Zhang #endif 28f98996d3SJunchao Zhang 298c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 308c3ff71bSJunchao Zhang 31076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 32076ba34aSJunchao Zhang we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 33076ba34aSJunchao Zhang In the latter case, it is important to set a_dual's sync state correctly. 34076ba34aSJunchao Zhang */ 35*9371c9d4SSatish Balay static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) { 36076ba34aSJunchao Zhang Mat_SeqAIJ *aijseq; 37076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 388c3ff71bSJunchao Zhang 398c3ff71bSJunchao Zhang PetscFunctionBegin; 40076ba34aSJunchao Zhang if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 419566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 42076ba34aSJunchao Zhang 43076ba34aSJunchao Zhang aijseq = static_cast<Mat_SeqAIJ *>(A->data); 44076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 45076ba34aSJunchao Zhang 46076ba34aSJunchao Zhang /* If aijkok does not exist, we just copy i, j to device. 47076ba34aSJunchao Zhang If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 48076ba34aSJunchao Zhang In both cases, we build a new aijkok structure. 49076ba34aSJunchao Zhang */ 50076ba34aSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 51076ba34aSJunchao Zhang delete aijkok; 52076ba34aSJunchao Zhang aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 53076ba34aSJunchao Zhang A->spptr = aijkok; 54076ba34aSJunchao Zhang } 55076ba34aSJunchao Zhang 565519a089SJose E. Roman if (aijkok->device_mat_d.data()) { 57a587d139SMark A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 58a587d139SMark } 598c3ff71bSJunchao Zhang PetscFunctionReturn(0); 608c3ff71bSJunchao Zhang } 618c3ff71bSJunchao Zhang 6286a27549SJunchao Zhang /* Sync CSR data to device if not yet */ 63*9371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) { 648c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 658c3ff71bSJunchao Zhang 668c3ff71bSJunchao Zhang PetscFunctionBegin; 675f80ce2aSJacob Faibussowitsch PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from host to device"); 685f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 69076ba34aSJunchao Zhang if (aijkok->a_dual.need_sync_device()) { 70076ba34aSJunchao Zhang aijkok->a_dual.sync_device(); 71580c7c76SPierre Jolivet aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 7286a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 738c3ff71bSJunchao Zhang } 748c3ff71bSJunchao Zhang PetscFunctionReturn(0); 758c3ff71bSJunchao Zhang } 768c3ff71bSJunchao Zhang 77076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */ 78*9371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) { 7986a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 8086a27549SJunchao Zhang 8186a27549SJunchao Zhang PetscFunctionBegin; 825f80ce2aSJacob Faibussowitsch PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 8386a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 8486a27549SJunchao Zhang aijkok->a_dual.modify_device(); 8586a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 8686a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 879566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 889566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 8986a27549SJunchao Zhang PetscFunctionReturn(0); 9086a27549SJunchao Zhang } 9186a27549SJunchao Zhang 92*9371c9d4SSatish Balay static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) { 93f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 94f0cf5187SStefano Zampini 95f0cf5187SStefano Zampini PetscFunctionBegin; 96f0cf5187SStefano Zampini PetscCheckTypeName(A, MATSEQAIJKOKKOS); 9786a27549SJunchao Zhang /* We do not expect one needs factors on host */ 985f80ce2aSJacob Faibussowitsch PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from device to host"); 995f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 100076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 101f0cf5187SStefano Zampini PetscFunctionReturn(0); 102f0cf5187SStefano Zampini } 103f0cf5187SStefano Zampini 104*9371c9d4SSatish Balay static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) { 105076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 106f0cf5187SStefano Zampini 107f0cf5187SStefano Zampini PetscFunctionBegin; 1085519a089SJose E. Roman /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 1095519a089SJose E. Roman Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 1105519a089SJose E. Roman reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 1115519a089SJose E. Roman must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 1125519a089SJose E. Roman */ 1135519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 114076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 115076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 116076ba34aSJunchao Zhang } else { /* Happens when calling MatSetValues on a newly created matrix */ 117076ba34aSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 118076ba34aSJunchao Zhang } 119076ba34aSJunchao Zhang PetscFunctionReturn(0); 120076ba34aSJunchao Zhang } 121076ba34aSJunchao Zhang 122*9371c9d4SSatish Balay static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) { 123076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 124076ba34aSJunchao Zhang 125076ba34aSJunchao Zhang PetscFunctionBegin; 1265519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 127076ba34aSJunchao Zhang PetscFunctionReturn(0); 128076ba34aSJunchao Zhang } 129076ba34aSJunchao Zhang 130*9371c9d4SSatish Balay static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) { 131076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 132076ba34aSJunchao Zhang 133076ba34aSJunchao Zhang PetscFunctionBegin; 1345519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 135076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 136076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1372328674fSJunchao Zhang } else { 1382328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 1392328674fSJunchao Zhang } 140076ba34aSJunchao Zhang PetscFunctionReturn(0); 141076ba34aSJunchao Zhang } 142076ba34aSJunchao Zhang 143*9371c9d4SSatish Balay static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) { 144076ba34aSJunchao Zhang PetscFunctionBegin; 145076ba34aSJunchao Zhang *array = NULL; 146076ba34aSJunchao Zhang PetscFunctionReturn(0); 147076ba34aSJunchao Zhang } 148076ba34aSJunchao Zhang 149*9371c9d4SSatish Balay static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) { 150076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 151076ba34aSJunchao Zhang 152076ba34aSJunchao Zhang PetscFunctionBegin; 1535519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 154076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1552328674fSJunchao Zhang } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 1562328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 1572328674fSJunchao Zhang } 158076ba34aSJunchao Zhang PetscFunctionReturn(0); 159076ba34aSJunchao Zhang } 160076ba34aSJunchao Zhang 161*9371c9d4SSatish Balay static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) { 162076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 163076ba34aSJunchao Zhang 164076ba34aSJunchao Zhang PetscFunctionBegin; 1655519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 166076ba34aSJunchao Zhang aijkok->a_dual.clear_sync_state(); 167076ba34aSJunchao Zhang aijkok->a_dual.modify_host(); 1682328674fSJunchao Zhang } 169f0cf5187SStefano Zampini PetscFunctionReturn(0); 170f0cf5187SStefano Zampini } 171f0cf5187SStefano Zampini 172*9371c9d4SSatish Balay static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) { 1737ee59b9bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1747ee59b9bSJunchao Zhang 1757ee59b9bSJunchao Zhang PetscFunctionBegin; 1767ee59b9bSJunchao Zhang PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 1777ee59b9bSJunchao Zhang 1787ee59b9bSJunchao Zhang if (i) *i = aijkok->i_device_data(); 1797ee59b9bSJunchao Zhang if (j) *j = aijkok->j_device_data(); 1807ee59b9bSJunchao Zhang if (a) { 1817ee59b9bSJunchao Zhang aijkok->a_dual.sync_device(); 1827ee59b9bSJunchao Zhang *a = aijkok->a_device_data(); 1837ee59b9bSJunchao Zhang } 1847ee59b9bSJunchao Zhang if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 1857ee59b9bSJunchao Zhang PetscFunctionReturn(0); 1867ee59b9bSJunchao Zhang } 1877ee59b9bSJunchao Zhang 188a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 189*9371c9d4SSatish Balay PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) { 190042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 191042217e8SBarry Smith Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 192a587d139SMark 193a587d139SMark PetscFunctionBegin; 1945f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 195152b3e56SJunchao Zhang aijkok->device_mat_d = create_mirror(DefaultMemorySpace(), h_mat_k); 196a587d139SMark Kokkos::deep_copy(aijkok->device_mat_d, h_mat_k); 197a587d139SMark PetscFunctionReturn(0); 198a587d139SMark } 199a587d139SMark 200a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 201*9371c9d4SSatish Balay PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) { 202042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 203a587d139SMark 204a587d139SMark PetscFunctionBegin; 205a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 206a587d139SMark *d_mat = aijkok->device_mat_d.data(); 207a587d139SMark } else { 2089566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it) 209a587d139SMark *d_mat = NULL; 210a587d139SMark } 211a587d139SMark PetscFunctionReturn(0); 212a587d139SMark } 213076ba34aSJunchao Zhang 214076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */ 215*9371c9d4SSatish Balay static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) { 216152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 217152b3e56SJunchao Zhang 218152b3e56SJunchao Zhang PetscFunctionBegin; 2195f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 220076ba34aSJunchao Zhang if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 221076ba34aSJunchao Zhang /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 2229566063dSJacob Faibussowitsch PetscCallCXX(aijkok->a_dual.sync_device()); 223f98996d3SJunchao Zhang PetscCallCXX(aijkok->csrmatT = transpose_matrix(aijkok->csrmat)); 224f98996d3SJunchao Zhang PetscCallCXX(sort_crs_matrix(aijkok->csrmatT)); 22586a27549SJunchao Zhang aijkok->transpose_updated = PETSC_TRUE; 226076ba34aSJunchao Zhang } 227076ba34aSJunchao Zhang *csrmatT = &aijkok->csrmatT; 228152b3e56SJunchao Zhang PetscFunctionReturn(0); 229152b3e56SJunchao Zhang } 230152b3e56SJunchao Zhang 231076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */ 232*9371c9d4SSatish Balay static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) { 233152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 234152b3e56SJunchao Zhang 235152b3e56SJunchao Zhang PetscFunctionBegin; 2369566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2375f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 238076ba34aSJunchao Zhang if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 2399566063dSJacob Faibussowitsch PetscCallCXX(aijkok->a_dual.sync_device()); 240f98996d3SJunchao Zhang PetscCallCXX(aijkok->csrmatH = transpose_matrix(aijkok->csrmat)); 241f98996d3SJunchao Zhang PetscCallCXX(sort_crs_matrix(aijkok->csrmatH)); 242076ba34aSJunchao Zhang #if defined(PETSC_USE_COMPLEX) 243076ba34aSJunchao Zhang const auto &a = aijkok->csrmatH.values; 244*9371c9d4SSatish Balay Kokkos::parallel_for( 245*9371c9d4SSatish Balay a.extent(0), KOKKOS_LAMBDA(MatRowMapType i) { a(i) = PetscConj(a(i)); }); 246076ba34aSJunchao Zhang #endif 24786a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_TRUE; 248076ba34aSJunchao Zhang } 249076ba34aSJunchao Zhang *csrmatH = &aijkok->csrmatH; 2509566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 251152b3e56SJunchao Zhang PetscFunctionReturn(0); 252152b3e56SJunchao Zhang } 253a587d139SMark 2548c3ff71bSJunchao Zhang /* y = A x */ 255*9371c9d4SSatish Balay static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) { 2568c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 257152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 258152b3e56SJunchao Zhang PetscScalarKokkosView yv; 2598c3ff71bSJunchao Zhang 2608c3ff71bSJunchao Zhang PetscFunctionBegin; 2619566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2629566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2639566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 2649566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 2658c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 266152b3e56SJunchao Zhang KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv); /* y = alpha A x + beta y */ 2679566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 2689566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 269076ba34aSJunchao Zhang /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 2709566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 2719566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 2728c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2738c3ff71bSJunchao Zhang } 2748c3ff71bSJunchao Zhang 2758c3ff71bSJunchao Zhang /* y = A^T x */ 276*9371c9d4SSatish Balay static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) { 2778c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 278152b3e56SJunchao Zhang const char *mode; 279152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 280152b3e56SJunchao Zhang PetscScalarKokkosView yv; 281076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 2828c3ff71bSJunchao Zhang 2838c3ff71bSJunchao Zhang PetscFunctionBegin; 2849566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2859566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2869566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 2879566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 288152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 2899566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 290152b3e56SJunchao Zhang mode = "N"; 291152b3e56SJunchao Zhang } else { 292076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 293076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 294152b3e56SJunchao Zhang mode = "T"; 295152b3e56SJunchao Zhang } 296076ba34aSJunchao Zhang KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv); /* y = alpha A^T x + beta y */ 2979566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 2989566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 2999566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz())); 3009566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3018c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3028c3ff71bSJunchao Zhang } 3038c3ff71bSJunchao Zhang 3048c3ff71bSJunchao Zhang /* y = A^H x */ 305*9371c9d4SSatish Balay static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) { 3068c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 307152b3e56SJunchao Zhang const char *mode; 308152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 309152b3e56SJunchao Zhang PetscScalarKokkosView yv; 310076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3118c3ff71bSJunchao Zhang 3128c3ff71bSJunchao Zhang PetscFunctionBegin; 3139566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3149566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3159566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 3169566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 317152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 3189566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 319152b3e56SJunchao Zhang mode = "N"; 320152b3e56SJunchao Zhang } else { 321076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 322076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 323152b3e56SJunchao Zhang mode = "C"; 324152b3e56SJunchao Zhang } 325076ba34aSJunchao Zhang KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv); /* y = alpha A^H x + beta y */ 3269566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 3279566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 3289566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz())); 3299566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3308c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3318c3ff71bSJunchao Zhang } 3328c3ff71bSJunchao Zhang 3338c3ff71bSJunchao Zhang /* z = A x + y */ 334*9371c9d4SSatish Balay static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) { 3358c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 336152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv, yv; 337152b3e56SJunchao Zhang PetscScalarKokkosView zv; 3388c3ff71bSJunchao Zhang 3398c3ff71bSJunchao Zhang PetscFunctionBegin; 3409566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3419566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3429566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 3439566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(yy, &yv)); 3449566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(zz, &zv)); 3458c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv, yv); 3468c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 347152b3e56SJunchao Zhang KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv); /* z = alpha A x + beta z */ 3489566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 3499566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(yy, &yv)); 3509566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 3519566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 3529566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3538c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3548c3ff71bSJunchao Zhang } 3558c3ff71bSJunchao Zhang 3568c3ff71bSJunchao Zhang /* z = A^T x + y */ 357*9371c9d4SSatish Balay static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) { 3588c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 359152b3e56SJunchao Zhang const char *mode; 360152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv, yv; 361152b3e56SJunchao Zhang PetscScalarKokkosView zv; 362076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3638c3ff71bSJunchao Zhang 3648c3ff71bSJunchao Zhang PetscFunctionBegin; 3659566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3669566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3679566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 3689566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(yy, &yv)); 3699566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(zz, &zv)); 3708c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv, yv); 371152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 3729566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 373152b3e56SJunchao Zhang mode = "N"; 374152b3e56SJunchao Zhang } else { 375076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 376076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 377152b3e56SJunchao Zhang mode = "T"; 378152b3e56SJunchao Zhang } 379076ba34aSJunchao Zhang KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv); /* z = alpha A^T x + beta z */ 3809566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 3819566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(yy, &yv)); 3829566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 3839566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz())); 3849566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3858c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3868c3ff71bSJunchao Zhang } 3878c3ff71bSJunchao Zhang 3888c3ff71bSJunchao Zhang /* z = A^H x + y */ 389*9371c9d4SSatish Balay static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) { 3908c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 391152b3e56SJunchao Zhang const char *mode; 392152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv, yv; 393152b3e56SJunchao Zhang PetscScalarKokkosView zv; 394076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3958c3ff71bSJunchao Zhang 3968c3ff71bSJunchao Zhang PetscFunctionBegin; 3979566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3989566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3999566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 4009566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(yy, &yv)); 4019566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(zz, &zv)); 4028c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv, yv); 403152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 4049566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 405152b3e56SJunchao Zhang mode = "N"; 406152b3e56SJunchao Zhang } else { 407076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 408076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 409152b3e56SJunchao Zhang mode = "C"; 410152b3e56SJunchao Zhang } 411076ba34aSJunchao Zhang KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv); /* z = alpha A^H x + beta z */ 4129566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 4139566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(yy, &yv)); 4149566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 4159566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz())); 4169566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 417152b3e56SJunchao Zhang PetscFunctionReturn(0); 418152b3e56SJunchao Zhang } 419152b3e56SJunchao Zhang 420*9371c9d4SSatish Balay PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) { 421152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 422152b3e56SJunchao Zhang 423152b3e56SJunchao Zhang PetscFunctionBegin; 424152b3e56SJunchao Zhang switch (op) { 425152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 426152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 4279566063dSJacob Faibussowitsch if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 428152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 429152b3e56SJunchao Zhang break; 430*9371c9d4SSatish Balay default: PetscCall(MatSetOption_SeqAIJ(A, op, flg)); break; 431152b3e56SJunchao Zhang } 4328c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4338c3ff71bSJunchao Zhang } 4348c3ff71bSJunchao Zhang 435076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */ 436*9371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) { 437076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 4388c3ff71bSJunchao Zhang 4398c3ff71bSJunchao Zhang PetscFunctionBegin; 4409566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 441076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 4429566063dSJacob Faibussowitsch PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 4438c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 4449566063dSJacob Faibussowitsch PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 445076ba34aSJunchao Zhang } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 4465f80ce2aSJacob Faibussowitsch PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 4479566063dSJacob Faibussowitsch PetscCall(PetscFree(A->defaultvectype)); 4489566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 4499566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 4509566063dSJacob Faibussowitsch PetscCall(MatSetOps_SeqAIJKokkos(A)); 451076ba34aSJunchao Zhang aseq = static_cast<Mat_SeqAIJ *>(A->data); 452394ed5ebSJunchao Zhang if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 4535f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 454076ba34aSJunchao Zhang A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE); 4558c3ff71bSJunchao Zhang } 456076ba34aSJunchao Zhang } 4578c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4588c3ff71bSJunchao Zhang } 4598c3ff71bSJunchao Zhang 460076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 461076ba34aSJunchao Zhang an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 462076ba34aSJunchao Zhang */ 463*9371c9d4SSatish Balay static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) { 464076ba34aSJunchao Zhang Mat_SeqAIJ *bseq; 465076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 466076ba34aSJunchao Zhang Mat mat; 4678c3ff71bSJunchao Zhang 4688c3ff71bSJunchao Zhang PetscFunctionBegin; 469076ba34aSJunchao Zhang /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 4709566063dSJacob Faibussowitsch PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 471076ba34aSJunchao Zhang mat = *B; 472076ba34aSJunchao Zhang if (A->assembled) { 473076ba34aSJunchao Zhang bseq = static_cast<Mat_SeqAIJ *>(mat->data); 474076ba34aSJunchao Zhang bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE); 475076ba34aSJunchao Zhang bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 476076ba34aSJunchao Zhang /* Now copy values to B if needed */ 477076ba34aSJunchao Zhang if (dupOption == MAT_COPY_VALUES) { 478076ba34aSJunchao Zhang if (akok->a_dual.need_sync_device()) { 479076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 480076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 481076ba34aSJunchao Zhang } else { /* If device has the latest data, we only copy data on device */ 482076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 483076ba34aSJunchao Zhang bkok->a_dual.modify_device(); 484076ba34aSJunchao Zhang } 485076ba34aSJunchao Zhang } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 486076ba34aSJunchao Zhang /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 487076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 488076ba34aSJunchao Zhang } 489076ba34aSJunchao Zhang mat->spptr = bkok; 490076ba34aSJunchao Zhang } 491076ba34aSJunchao Zhang 4929566063dSJacob Faibussowitsch PetscCall(PetscFree(mat->defaultvectype)); 4939566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 4949566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 4959566063dSJacob Faibussowitsch PetscCall(MatSetOps_SeqAIJKokkos(mat)); 4968c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4978c3ff71bSJunchao Zhang } 4988c3ff71bSJunchao Zhang 499*9371c9d4SSatish Balay static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) { 5000ecb592aSJunchao Zhang Mat At; 501ff751488SJunchao Zhang KokkosCsrMatrix *internT; 5020ecb592aSJunchao Zhang Mat_SeqAIJKokkos *atkok, *bkok; 5030ecb592aSJunchao Zhang 5040ecb592aSJunchao Zhang PetscFunctionBegin; 5057fb60732SBarry Smith if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 5069566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 5070ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 508ff751488SJunchao Zhang /* Deep copy internT, as we want to isolate the internal transpose */ 5099566063dSJacob Faibussowitsch PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", *internT))); 5109566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 5110ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) *B = At; 5129566063dSJacob Faibussowitsch else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 5130ecb592aSJunchao Zhang } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 5140ecb592aSJunchao Zhang if ((*B)->assembled) { 5150ecb592aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 5169566063dSJacob Faibussowitsch PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT->values)); 5179566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 5180ecb592aSJunchao Zhang } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 5190ecb592aSJunchao Zhang Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 5200ecb592aSJunchao Zhang MatScalarKokkosViewHost a_h(bseq->a, internT->nnz()); /* bseq->nz = 0 if unassembled */ 5210ecb592aSJunchao Zhang MatColIdxKokkosViewHost j_h(bseq->j, internT->nnz()); 5229566063dSJacob Faibussowitsch PetscCallCXX(Kokkos::deep_copy(a_h, internT->values)); 5239566063dSJacob Faibussowitsch PetscCallCXX(Kokkos::deep_copy(j_h, internT->graph.entries)); 5240ecb592aSJunchao Zhang } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 5250ecb592aSJunchao Zhang } 5260ecb592aSJunchao Zhang PetscFunctionReturn(0); 5270ecb592aSJunchao Zhang } 5280ecb592aSJunchao Zhang 529*9371c9d4SSatish Balay static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) { 53086a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 5318c3ff71bSJunchao Zhang 5328c3ff71bSJunchao Zhang PetscFunctionBegin; 53386a27549SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 53486a27549SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 5358c3ff71bSJunchao Zhang delete aijkok; 53686a27549SJunchao Zhang } else { 53786a27549SJunchao Zhang delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 53886a27549SJunchao Zhang } 539cbc6b225SStefano Zampini A->spptr = NULL; 5409566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 5419566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 5429566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 5439566063dSJacob Faibussowitsch PetscCall(MatDestroy_SeqAIJ(A)); 5448c3ff71bSJunchao Zhang PetscFunctionReturn(0); 5458c3ff71bSJunchao Zhang } 5468c3ff71bSJunchao Zhang 5473f3ba80aSJunchao Zhang /*MC 5483f3ba80aSJunchao Zhang MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 5493f3ba80aSJunchao Zhang 5503f3ba80aSJunchao Zhang A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 5513f3ba80aSJunchao Zhang 5523f3ba80aSJunchao Zhang Options Database Keys: 5533f3ba80aSJunchao Zhang . -mat_type aijkokkos - sets the matrix type to "aijkokkos" during a call to MatSetFromOptions() 5543f3ba80aSJunchao Zhang 5553f3ba80aSJunchao Zhang Level: beginner 5563f3ba80aSJunchao Zhang 557db781477SPatrick Sanan .seealso: `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 5583f3ba80aSJunchao Zhang M*/ 559*9371c9d4SSatish Balay PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) { 56086a27549SJunchao Zhang PetscFunctionBegin; 5619566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 5629566063dSJacob Faibussowitsch PetscCall(MatCreate_SeqAIJ(A)); 5639566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 56486a27549SJunchao Zhang PetscFunctionReturn(0); 56586a27549SJunchao Zhang } 56686a27549SJunchao Zhang 567076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 568*9371c9d4SSatish Balay PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) { 569076ba34aSJunchao Zhang Mat_SeqAIJ *a, *b; 570076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok, *bkok, *ckok; 571076ba34aSJunchao Zhang MatScalarKokkosView aa, ba, ca; 572076ba34aSJunchao Zhang MatRowMapKokkosView ai, bi, ci; 573076ba34aSJunchao Zhang MatColIdxKokkosView aj, bj, cj; 574076ba34aSJunchao Zhang PetscInt m, n, nnz, aN; 575a3f881fbSStefano Zampini 576a3f881fbSStefano Zampini PetscFunctionBegin; 577076ba34aSJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 578076ba34aSJunchao Zhang PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 579076ba34aSJunchao Zhang PetscValidPointer(C, 4); 580076ba34aSJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 581076ba34aSJunchao Zhang PetscCheckTypeName(B, MATSEQAIJKOKKOS); 5825f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 5835f80ce2aSJacob Faibussowitsch PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 584076ba34aSJunchao Zhang 5859566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 5869566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 587076ba34aSJunchao Zhang a = static_cast<Mat_SeqAIJ *>(A->data); 588076ba34aSJunchao Zhang b = static_cast<Mat_SeqAIJ *>(B->data); 589076ba34aSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 590076ba34aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 591076ba34aSJunchao Zhang aa = akok->a_dual.view_device(); 592076ba34aSJunchao Zhang ai = akok->i_dual.view_device(); 593076ba34aSJunchao Zhang ba = bkok->a_dual.view_device(); 594076ba34aSJunchao Zhang bi = bkok->i_dual.view_device(); 595076ba34aSJunchao Zhang m = A->rmap->n; /* M, N and nnz of C */ 596076ba34aSJunchao Zhang n = A->cmap->n + B->cmap->n; 597076ba34aSJunchao Zhang nnz = a->nz + b->nz; 598076ba34aSJunchao Zhang aN = A->cmap->n; /* N of A */ 599076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { 600076ba34aSJunchao Zhang aj = akok->j_dual.view_device(); 601076ba34aSJunchao Zhang bj = bkok->j_dual.view_device(); 602076ba34aSJunchao Zhang auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 603076ba34aSJunchao Zhang auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 604076ba34aSJunchao Zhang auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 605076ba34aSJunchao Zhang ca = ca_dual.view_device(); 606076ba34aSJunchao Zhang ci = ci_dual.view_device(); 607076ba34aSJunchao Zhang cj = cj_dual.view_device(); 608076ba34aSJunchao Zhang 609076ba34aSJunchao Zhang /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 610*9371c9d4SSatish Balay Kokkos::parallel_for( 611*9371c9d4SSatish Balay Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 612076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 613076ba34aSJunchao Zhang PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 614076ba34aSJunchao Zhang 615076ba34aSJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 616076ba34aSJunchao Zhang ci(i) = coffset; 617076ba34aSJunchao Zhang if (i == m - 1) ci(m) = ai(m) + bi(m); 618076ba34aSJunchao Zhang }); 619076ba34aSJunchao Zhang 620076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 621076ba34aSJunchao Zhang if (k < alen) { 622076ba34aSJunchao Zhang ca(coffset + k) = aa(ai(i) + k); 623076ba34aSJunchao Zhang cj(coffset + k) = aj(ai(i) + k); 624076ba34aSJunchao Zhang } else { 625076ba34aSJunchao Zhang ca(coffset + k) = ba(bi(i) + k - alen); 626076ba34aSJunchao Zhang cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 627076ba34aSJunchao Zhang } 628076ba34aSJunchao Zhang }); 629076ba34aSJunchao Zhang }); 630076ba34aSJunchao Zhang ca_dual.modify_device(); 631076ba34aSJunchao Zhang ci_dual.modify_device(); 632076ba34aSJunchao Zhang cj_dual.modify_device(); 6339566063dSJacob Faibussowitsch PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 6349566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 635076ba34aSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { 636076ba34aSJunchao Zhang PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 637076ba34aSJunchao Zhang PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 638076ba34aSJunchao Zhang ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 639076ba34aSJunchao Zhang ca = ckok->a_dual.view_device(); 640076ba34aSJunchao Zhang ci = ckok->i_dual.view_device(); 641076ba34aSJunchao Zhang 642*9371c9d4SSatish Balay Kokkos::parallel_for( 643*9371c9d4SSatish Balay Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 644076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 645076ba34aSJunchao Zhang PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 646076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 647076ba34aSJunchao Zhang if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 648076ba34aSJunchao Zhang else ca(ci(i) + k) = ba(bi(i) + k - alen); 649076ba34aSJunchao Zhang }); 650076ba34aSJunchao Zhang }); 6519566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 652076ba34aSJunchao Zhang } 653076ba34aSJunchao Zhang PetscFunctionReturn(0); 654076ba34aSJunchao Zhang } 655076ba34aSJunchao Zhang 656*9371c9d4SSatish Balay static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) { 657076ba34aSJunchao Zhang PetscFunctionBegin; 658076ba34aSJunchao Zhang delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 659a3f881fbSStefano Zampini PetscFunctionReturn(0); 660a3f881fbSStefano Zampini } 661a3f881fbSStefano Zampini 662*9371c9d4SSatish Balay static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) { 663a3f881fbSStefano Zampini Mat_Product *product = C->product; 664a3f881fbSStefano Zampini Mat A, B; 665076ba34aSJunchao Zhang bool transA, transB; /* use bool, since KK needs this type */ 666a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok, *bkok, *ckok; 667a3f881fbSStefano Zampini Mat_SeqAIJ *c; 668076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 669076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA, *csrmatB; 670a3f881fbSStefano Zampini 671a3f881fbSStefano Zampini PetscFunctionBegin; 672a3f881fbSStefano Zampini MatCheckProduct(C, 1); 6735f80ce2aSJacob Faibussowitsch PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 674076ba34aSJunchao Zhang pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 675076ba34aSJunchao Zhang 676076ba34aSJunchao Zhang if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 677076ba34aSJunchao Zhang pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 678076ba34aSJunchao Zhang PetscFunctionReturn(0); 679076ba34aSJunchao Zhang } 680076ba34aSJunchao Zhang 681076ba34aSJunchao Zhang switch (product->type) { 682*9371c9d4SSatish Balay case MATPRODUCT_AB: 683*9371c9d4SSatish Balay transA = false; 684*9371c9d4SSatish Balay transB = false; 685*9371c9d4SSatish Balay break; 686*9371c9d4SSatish Balay case MATPRODUCT_AtB: 687*9371c9d4SSatish Balay transA = true; 688*9371c9d4SSatish Balay transB = false; 689*9371c9d4SSatish Balay break; 690*9371c9d4SSatish Balay case MATPRODUCT_ABt: 691*9371c9d4SSatish Balay transA = false; 692*9371c9d4SSatish Balay transB = true; 693*9371c9d4SSatish Balay break; 694*9371c9d4SSatish Balay default: SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 695076ba34aSJunchao Zhang } 696076ba34aSJunchao Zhang 697a3f881fbSStefano Zampini A = product->A; 698a3f881fbSStefano Zampini B = product->B; 6999566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 7009566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 701a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 702a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 703a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 704076ba34aSJunchao Zhang 7055f80ce2aSJacob Faibussowitsch PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 706076ba34aSJunchao Zhang 707076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 708076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 709076ba34aSJunchao Zhang 710076ba34aSJunchao Zhang /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 711076ba34aSJunchao Zhang if (transA) { 7129566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 713076ba34aSJunchao Zhang transA = false; 714a3f881fbSStefano Zampini } 715a3f881fbSStefano Zampini 716076ba34aSJunchao Zhang if (transB) { 7179566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 718076ba34aSJunchao Zhang transB = false; 719076ba34aSJunchao Zhang } 7209566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 7219566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, ckok->csrmat)); 722f98996d3SJunchao Zhang PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 7239566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 7249566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(C)); 725a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 726a3f881fbSStefano Zampini c = (Mat_SeqAIJ *)C->data; 7279566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 7289566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 7299566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 730a3f881fbSStefano Zampini c->reallocs = 0; 731076ba34aSJunchao Zhang C->info.mallocs = 0; 732a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 733a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 734a3f881fbSStefano Zampini C->num_ass++; 735a3f881fbSStefano Zampini PetscFunctionReturn(0); 736a3f881fbSStefano Zampini } 737a3f881fbSStefano Zampini 738*9371c9d4SSatish Balay static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) { 739076ba34aSJunchao Zhang Mat_Product *product = C->product; 740076ba34aSJunchao Zhang MatProductType ptype; 741076ba34aSJunchao Zhang Mat A, B; 742076ba34aSJunchao Zhang bool transA, transB; 743076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok, *bkok, *ckok; 744076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 745076ba34aSJunchao Zhang MPI_Comm comm; 746076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA, *csrmatB, csrmatC; 747a3f881fbSStefano Zampini 748a3f881fbSStefano Zampini PetscFunctionBegin; 749a3f881fbSStefano Zampini MatCheckProduct(C, 1); 7509566063dSJacob Faibussowitsch PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 7515f80ce2aSJacob Faibussowitsch PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 752a3f881fbSStefano Zampini A = product->A; 753a3f881fbSStefano Zampini B = product->B; 7549566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 7559566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 756a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 757a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 758076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 759076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 760076ba34aSJunchao Zhang 761a3f881fbSStefano Zampini ptype = product->type; 762a3f881fbSStefano Zampini switch (ptype) { 763*9371c9d4SSatish Balay case MATPRODUCT_AB: 764*9371c9d4SSatish Balay transA = false; 765*9371c9d4SSatish Balay transB = false; 766*9371c9d4SSatish Balay break; 767*9371c9d4SSatish Balay case MATPRODUCT_AtB: 768*9371c9d4SSatish Balay transA = true; 769*9371c9d4SSatish Balay transB = false; 770*9371c9d4SSatish Balay break; 771*9371c9d4SSatish Balay case MATPRODUCT_ABt: 772*9371c9d4SSatish Balay transA = false; 773*9371c9d4SSatish Balay transB = true; 774*9371c9d4SSatish Balay break; 775*9371c9d4SSatish Balay default: SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 776a3f881fbSStefano Zampini } 777a3f881fbSStefano Zampini 778076ba34aSJunchao Zhang product->data = pdata = new MatProductData_SeqAIJKokkos(); 779076ba34aSJunchao Zhang pdata->kh.set_team_work_size(16); 780076ba34aSJunchao Zhang pdata->kh.set_dynamic_scheduling(true); 781076ba34aSJunchao Zhang pdata->reusesym = product->api_user; 782a3f881fbSStefano Zampini 783076ba34aSJunchao Zhang /* TODO: add command line options to select spgemm algorithms */ 784076ba34aSJunchao Zhang auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 7856ffb9508SJunchao Zhang /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 7866ffb9508SJunchao Zhang We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 7876ffb9508SJunchao Zhang */ 788076ba34aSJunchao Zhang pdata->kh.create_spgemm_handle(spgemm_alg); 789076ba34aSJunchao Zhang 7909566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 791076ba34aSJunchao Zhang /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 792076ba34aSJunchao Zhang if (transA) { 7939566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 794076ba34aSJunchao Zhang transA = false; 795076ba34aSJunchao Zhang } 796076ba34aSJunchao Zhang 797076ba34aSJunchao Zhang if (transB) { 7989566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 799076ba34aSJunchao Zhang transB = false; 800076ba34aSJunchao Zhang } 801076ba34aSJunchao Zhang 8029566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC)); 803076ba34aSJunchao Zhang /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 804076ba34aSJunchao Zhang So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 805076ba34aSJunchao Zhang calling new Mat_SeqAIJKokkos(). 806076ba34aSJunchao Zhang TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 807076ba34aSJunchao Zhang */ 8089566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC)); 809f98996d3SJunchao Zhang PetscCallCXX(sort_crs_matrix(csrmatC)); 8109566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 811076ba34aSJunchao Zhang 8129566063dSJacob Faibussowitsch PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 8139566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 814076ba34aSJunchao Zhang C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 815a3f881fbSStefano Zampini PetscFunctionReturn(0); 816a3f881fbSStefano Zampini } 817a3f881fbSStefano Zampini 818a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 819*9371c9d4SSatish Balay static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) { 820076ba34aSJunchao Zhang Mat_Product *product = mat->product; 821a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 822a3f881fbSStefano Zampini 823a3f881fbSStefano Zampini PetscFunctionBegin; 824a3f881fbSStefano Zampini MatCheckProduct(mat, 1); 8259566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 826*9371c9d4SSatish Balay if (product->type == MATPRODUCT_ABC) { PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); } 827a3f881fbSStefano Zampini if (Biskok && Ciskok) { 828a3f881fbSStefano Zampini switch (product->type) { 829a3f881fbSStefano Zampini case MATPRODUCT_AB: 830a3f881fbSStefano Zampini case MATPRODUCT_AtB: 831*9371c9d4SSatish Balay case MATPRODUCT_ABt: mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; break; 832a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 833a3f881fbSStefano Zampini case MATPRODUCT_RARt: 834*9371c9d4SSatish Balay case MATPRODUCT_ABC: mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; break; 835*9371c9d4SSatish Balay default: break; 836a3f881fbSStefano Zampini } 837a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 8389566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 839a3f881fbSStefano Zampini } 840a3f881fbSStefano Zampini PetscFunctionReturn(0); 841a3f881fbSStefano Zampini } 842a587d139SMark 843*9371c9d4SSatish Balay static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) { 844f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 845f0cf5187SStefano Zampini 846f0cf5187SStefano Zampini PetscFunctionBegin; 8479566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 8489566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 849f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 850076ba34aSJunchao Zhang KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 8519566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 8529566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 8539566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 854f0cf5187SStefano Zampini PetscFunctionReturn(0); 855f0cf5187SStefano Zampini } 856f0cf5187SStefano Zampini 857*9371c9d4SSatish Balay static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) { 858076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 859a587d139SMark 860a587d139SMark PetscFunctionBegin; 861076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 8622328674fSJunchao Zhang if (aijkok) { /* Only zero the device if data is already there */ 863076ba34aSJunchao Zhang KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0); 8649566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 8652328674fSJunchao Zhang } else { /* Might be preallocated but not assembled */ 8669566063dSJacob Faibussowitsch PetscCall(MatZeroEntries_SeqAIJ(A)); 8672328674fSJunchao Zhang } 868a587d139SMark PetscFunctionReturn(0); 869a587d139SMark } 870a587d139SMark 871*9371c9d4SSatish Balay static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) { 872f78ce678SMark Adams Mat_SeqAIJ *aijseq; 873f78ce678SMark Adams Mat_SeqAIJKokkos *aijkok; 874f78ce678SMark Adams PetscInt n; 875f78ce678SMark Adams PetscScalarKokkosView xv; 876f78ce678SMark Adams 877f78ce678SMark Adams PetscFunctionBegin; 878f78ce678SMark Adams PetscCall(VecGetLocalSize(x, &n)); 879f78ce678SMark Adams PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 880f78ce678SMark Adams PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 881f78ce678SMark Adams 882f78ce678SMark Adams PetscCall(MatSeqAIJKokkosSyncDevice(A)); 883f78ce678SMark Adams aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 884f78ce678SMark Adams 885f78ce678SMark Adams if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */ 886f78ce678SMark Adams PetscCall(MatMarkDiagonal_SeqAIJ(A)); 887f78ce678SMark Adams aijseq = static_cast<Mat_SeqAIJ *>(A->data); 888f78ce678SMark Adams aijkok->SetDiagonal(aijseq->diag); 889f78ce678SMark Adams } 890f78ce678SMark Adams 891f78ce678SMark Adams const auto &Aa = aijkok->a_dual.view_device(); 892f78ce678SMark Adams const auto &Ai = aijkok->i_dual.view_device(); 893f78ce678SMark Adams const auto &Adiag = aijkok->diag_dual.view_device(); 894f78ce678SMark Adams 895f78ce678SMark Adams PetscCall(VecGetKokkosViewWrite(x, &xv)); 896*9371c9d4SSatish Balay Kokkos::parallel_for( 897*9371c9d4SSatish Balay n, KOKKOS_LAMBDA(const PetscInt i) { 898f78ce678SMark Adams if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 899f78ce678SMark Adams else xv(i) = 0; 900f78ce678SMark Adams }); 901f78ce678SMark Adams PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 902f78ce678SMark Adams PetscFunctionReturn(0); 903f78ce678SMark Adams } 904f78ce678SMark Adams 905db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 906*9371c9d4SSatish Balay PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) { 907db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 908db78de30SJunchao Zhang 909db78de30SJunchao Zhang PetscFunctionBegin; 910db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 911db78de30SJunchao Zhang PetscValidPointer(kv, 2); 912db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 9139566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 914db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 915076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 916db78de30SJunchao Zhang PetscFunctionReturn(0); 917db78de30SJunchao Zhang } 918db78de30SJunchao Zhang 919*9371c9d4SSatish Balay PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) { 920db78de30SJunchao Zhang PetscFunctionBegin; 921db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 922db78de30SJunchao Zhang PetscValidPointer(kv, 2); 923db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 924db78de30SJunchao Zhang PetscFunctionReturn(0); 925db78de30SJunchao Zhang } 926db78de30SJunchao Zhang 927*9371c9d4SSatish Balay PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) { 928db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 929db78de30SJunchao Zhang 930db78de30SJunchao Zhang PetscFunctionBegin; 931db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 932db78de30SJunchao Zhang PetscValidPointer(kv, 2); 933db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 9349566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 935db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 936076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 937db78de30SJunchao Zhang PetscFunctionReturn(0); 938db78de30SJunchao Zhang } 939db78de30SJunchao Zhang 940*9371c9d4SSatish Balay PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) { 941db78de30SJunchao Zhang PetscFunctionBegin; 942db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 943db78de30SJunchao Zhang PetscValidPointer(kv, 2); 944db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 9459566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 946db78de30SJunchao Zhang PetscFunctionReturn(0); 947db78de30SJunchao Zhang } 948db78de30SJunchao Zhang 949*9371c9d4SSatish Balay PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) { 950db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 951db78de30SJunchao Zhang 952db78de30SJunchao Zhang PetscFunctionBegin; 953db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 954db78de30SJunchao Zhang PetscValidPointer(kv, 2); 955db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 956db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 957076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 958db78de30SJunchao Zhang PetscFunctionReturn(0); 959db78de30SJunchao Zhang } 960db78de30SJunchao Zhang 961*9371c9d4SSatish Balay PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) { 962db78de30SJunchao Zhang PetscFunctionBegin; 963db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 964db78de30SJunchao Zhang PetscValidPointer(kv, 2); 965db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 9669566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 967db78de30SJunchao Zhang PetscFunctionReturn(0); 968db78de30SJunchao Zhang } 969db78de30SJunchao Zhang 970c17cf699SJunchao Zhang /* Computes Y += alpha X */ 971*9371c9d4SSatish Balay static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) { 972a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 973c17cf699SJunchao Zhang Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 974c17cf699SJunchao Zhang ConstMatScalarKokkosView Xa; 975c17cf699SJunchao Zhang MatScalarKokkosView Ya; 976a587d139SMark 977a587d139SMark PetscFunctionBegin; 978c17cf699SJunchao Zhang PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 979c17cf699SJunchao Zhang PetscCheckTypeName(X, MATSEQAIJKOKKOS); 9809566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 9819566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(X)); 9829566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 983db78de30SJunchao Zhang 984c17cf699SJunchao Zhang if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 985c17cf699SJunchao Zhang /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 986a587d139SMark PetscBool e; 9879566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 988a587d139SMark if (e) { 9899566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 990c17cf699SJunchao Zhang if (e) pattern = SAME_NONZERO_PATTERN; 991a587d139SMark } 992a587d139SMark } 993db78de30SJunchao Zhang 994c17cf699SJunchao Zhang /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 995c17cf699SJunchao Zhang cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 996c17cf699SJunchao Zhang If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 997c17cf699SJunchao Zhang KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 998c17cf699SJunchao Zhang */ 999c17cf699SJunchao Zhang ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1000c17cf699SJunchao Zhang xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1001c17cf699SJunchao Zhang Xa = xkok->a_dual.view_device(); 1002c17cf699SJunchao Zhang Ya = ykok->a_dual.view_device(); 1003c17cf699SJunchao Zhang 1004c17cf699SJunchao Zhang if (pattern == SAME_NONZERO_PATTERN) { 1005c17cf699SJunchao Zhang KokkosBlas::axpy(alpha, Xa, Ya); 10069566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1007c17cf699SJunchao Zhang } else if (pattern == SUBSET_NONZERO_PATTERN) { 1008c17cf699SJunchao Zhang MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1009c17cf699SJunchao Zhang MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1010c17cf699SJunchao Zhang 1011*9371c9d4SSatish Balay Kokkos::parallel_for( 1012*9371c9d4SSatish Balay Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1013c17cf699SJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 1014c17cf699SJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { /* Only one thread works in a team */ 1015c17cf699SJunchao Zhang PetscInt p, q = Yi(i); 1016c17cf699SJunchao Zhang for (p = Xi(i); p < Xi(i + 1); p++) { /* For each nonzero on row i of X */ 1017c17cf699SJunchao Zhang while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; /* find the matching nonzero on row i of Y */ 1018c17cf699SJunchao Zhang if (Xj(p) == Yj(q)) { /* Found it */ 1019c17cf699SJunchao Zhang Ya(q) += alpha * Xa(p); 1020c17cf699SJunchao Zhang q++; 1021a587d139SMark } else { 1022c17cf699SJunchao Zhang /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1023c17cf699SJunchao Zhang Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1024c17cf699SJunchao Zhang */ 1025*9371c9d4SSatish Balay if (Yi(i) != Yi(i + 1)) 1026*9371c9d4SSatish Balay Ya(Yi(i)) = 10278b8b16f9SJunchao Zhang #if PETSC_PKG_KOKKOS_VERSION_GE(3, 6, 99) 10288b8b16f9SJunchao Zhang Kokkos::nan("1"); /* auto promote the double NaN if needed */ 10298b8b16f9SJunchao Zhang #else 10308b8b16f9SJunchao Zhang Kokkos::Experimental::nan("1"); 10318b8b16f9SJunchao Zhang #endif 1032a587d139SMark } 1033c17cf699SJunchao Zhang } 1034c17cf699SJunchao Zhang }); 1035c17cf699SJunchao Zhang }); 10369566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1037c17cf699SJunchao Zhang } else { /* different nonzero patterns */ 1038c17cf699SJunchao Zhang Mat Z; 1039c17cf699SJunchao Zhang KokkosCsrMatrix zcsr; 1040c17cf699SJunchao Zhang KernelHandle kh; 1041c17cf699SJunchao Zhang kh.create_spadd_handle(false); 1042c17cf699SJunchao Zhang KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1043c17cf699SJunchao Zhang KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1044c17cf699SJunchao Zhang zkok = new Mat_SeqAIJKokkos(zcsr); 10459566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 10469566063dSJacob Faibussowitsch PetscCall(MatHeaderReplace(Y, &Z)); 1047c17cf699SJunchao Zhang kh.destroy_spadd_handle(); 1048c17cf699SJunchao Zhang } 10499566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 10509566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); /* Because we scaled X and then added it to Y */ 1051a587d139SMark PetscFunctionReturn(0); 1052a587d139SMark } 1053a587d139SMark 1054*9371c9d4SSatish Balay static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) { 105542550becSJunchao Zhang Mat_SeqAIJKokkos *akok; 105642550becSJunchao Zhang Mat_SeqAIJ *aseq; 105742550becSJunchao Zhang 105842550becSJunchao Zhang PetscFunctionBegin; 10599566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1060394ed5ebSJunchao Zhang aseq = static_cast<Mat_SeqAIJ *>(mat->data); 106142550becSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1062cbc6b225SStefano Zampini delete akok; 1063cbc6b225SStefano Zampini mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE); 10649566063dSJacob Faibussowitsch PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1065394ed5ebSJunchao Zhang akok->SetUpCOO(aseq); 106642550becSJunchao Zhang PetscFunctionReturn(0); 106742550becSJunchao Zhang } 106842550becSJunchao Zhang 1069*9371c9d4SSatish Balay static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) { 107042550becSJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 107142550becSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1072394ed5ebSJunchao Zhang PetscCount Annz = aseq->nz; 1073394ed5ebSJunchao Zhang const PetscCountKokkosView &jmap = akok->jmap_d; 1074394ed5ebSJunchao Zhang const PetscCountKokkosView &perm = akok->perm_d; 107542550becSJunchao Zhang MatScalarKokkosView Aa; 107642550becSJunchao Zhang ConstMatScalarKokkosView kv; 107742550becSJunchao Zhang PetscMemType memtype; 107842550becSJunchao Zhang 107942550becSJunchao Zhang PetscFunctionBegin; 10809566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(v, &memtype)); 108142550becSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1082394ed5ebSJunchao Zhang kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n)); 108342550becSJunchao Zhang } else { 1084394ed5ebSJunchao Zhang kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */ 108542550becSJunchao Zhang } 108642550becSJunchao Zhang 1087c7b718f4SJunchao Zhang if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1088c7b718f4SJunchao Zhang else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 108942550becSJunchao Zhang 1090*9371c9d4SSatish Balay Kokkos::parallel_for( 1091*9371c9d4SSatish Balay Annz, KOKKOS_LAMBDA(const PetscCount i) { 1092c7b718f4SJunchao Zhang PetscScalar sum = 0.0; 1093c7b718f4SJunchao Zhang for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1094c7b718f4SJunchao Zhang Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1095c7b718f4SJunchao Zhang }); 1096394ed5ebSJunchao Zhang 10979566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 10989566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 109942550becSJunchao Zhang PetscFunctionReturn(0); 110042550becSJunchao Zhang } 110142550becSJunchao Zhang 1102*9371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag) { 11035fbaff96SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 11045fbaff96SJunchao Zhang MatScalarKokkosView Aa; 11055fbaff96SJunchao Zhang const MatRowMapKokkosView &Ai = akok->i_dual.view_device(); 11065fbaff96SJunchao Zhang PetscInt m = A->rmap->n; 11075fbaff96SJunchao Zhang ConstMatRowMapKokkosView Adiag(diag, m); /* diag is a device pointer */ 11085fbaff96SJunchao Zhang 11095fbaff96SJunchao Zhang PetscFunctionBegin; 11105fbaff96SJunchao Zhang PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); 1111*9371c9d4SSatish Balay Kokkos::parallel_for( 1112*9371c9d4SSatish Balay m, KOKKOS_LAMBDA(const PetscInt i) { 11135fbaff96SJunchao Zhang PetscScalar tmp; 11145fbaff96SJunchao Zhang if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */ 11155fbaff96SJunchao Zhang tmp = Aa(Ai(i)); 11165fbaff96SJunchao Zhang Aa(Ai(i)) = Aa(Adiag(i)); 11175fbaff96SJunchao Zhang Aa(Adiag(i)) = tmp; 11185fbaff96SJunchao Zhang } 11195fbaff96SJunchao Zhang }); 11205fbaff96SJunchao Zhang PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 11215fbaff96SJunchao Zhang PetscFunctionReturn(0); 11225fbaff96SJunchao Zhang } 11235fbaff96SJunchao Zhang 1124*9371c9d4SSatish Balay static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) { 11258f7e8f9dSMark Adams PetscFunctionBegin; 11269566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncHost(A)); 11279566063dSJacob Faibussowitsch PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 11288f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 11298f7e8f9dSMark Adams PetscFunctionReturn(0); 11308f7e8f9dSMark Adams } 11318f7e8f9dSMark Adams 1132*9371c9d4SSatish Balay static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) { 1133076ba34aSJunchao Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1134076ba34aSJunchao Zhang 11358c3ff71bSJunchao Zhang PetscFunctionBegin; 1136076ba34aSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 11376f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 11386f3d89d0SStefano Zampini 11398c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 11408c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 11418c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1142a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 1143f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 1144a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1145076ba34aSJunchao Zhang A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 11468c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 11478c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 11488c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 11498c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 11508c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 11518c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1152076ba34aSJunchao Zhang A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 11530ecb592aSJunchao Zhang A->ops->transpose = MatTranspose_SeqAIJKokkos; 1154152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 1155f78ce678SMark Adams A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1156076ba34aSJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1157076ba34aSJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1158076ba34aSJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1159076ba34aSJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1160076ba34aSJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1161076ba34aSJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 11627ee59b9bSJunchao Zhang a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 116342550becSJunchao Zhang 11649566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 11659566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 1166076ba34aSJunchao Zhang PetscFunctionReturn(0); 1167076ba34aSJunchao Zhang } 1168076ba34aSJunchao Zhang 1169*9371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) { 1170076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 1171076ba34aSJunchao Zhang PetscInt i, m, n; 1172076ba34aSJunchao Zhang 1173076ba34aSJunchao Zhang PetscFunctionBegin; 11745f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1175076ba34aSJunchao Zhang 1176076ba34aSJunchao Zhang m = akok->nrows(); 1177076ba34aSJunchao Zhang n = akok->ncols(); 11789566063dSJacob Faibussowitsch PetscCall(MatSetSizes(A, m, n, m, n)); 11799566063dSJacob Faibussowitsch PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1180076ba34aSJunchao Zhang 1181076ba34aSJunchao Zhang /* Set up data structures of A as a MATSEQAIJ */ 11829566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1183076ba34aSJunchao Zhang aseq = (Mat_SeqAIJ *)(A)->data; 1184076ba34aSJunchao Zhang 1185076ba34aSJunchao Zhang akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1186076ba34aSJunchao Zhang akok->j_dual.sync_host(); 1187076ba34aSJunchao Zhang 1188076ba34aSJunchao Zhang aseq->i = akok->i_host_data(); 1189076ba34aSJunchao Zhang aseq->j = akok->j_host_data(); 1190076ba34aSJunchao Zhang aseq->a = akok->a_host_data(); 1191076ba34aSJunchao Zhang aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1192076ba34aSJunchao Zhang aseq->singlemalloc = PETSC_FALSE; 1193076ba34aSJunchao Zhang aseq->free_a = PETSC_FALSE; 1194076ba34aSJunchao Zhang aseq->free_ij = PETSC_FALSE; 1195076ba34aSJunchao Zhang aseq->nz = akok->nnz(); 1196076ba34aSJunchao Zhang aseq->maxnz = aseq->nz; 1197076ba34aSJunchao Zhang 11989566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aseq->imax)); 11999566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aseq->ilen)); 1200*9371c9d4SSatish Balay for (i = 0; i < m; i++) { aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; } 1201076ba34aSJunchao Zhang 1202076ba34aSJunchao Zhang /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1203076ba34aSJunchao Zhang akok->nonzerostate = A->nonzerostate; 1204ff751488SJunchao Zhang A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 12059566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 12069566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 1207076ba34aSJunchao Zhang PetscFunctionReturn(0); 1208076ba34aSJunchao Zhang } 1209076ba34aSJunchao Zhang 1210076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1211076ba34aSJunchao Zhang 1212076ba34aSJunchao Zhang Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1213076ba34aSJunchao Zhang */ 1214*9371c9d4SSatish Balay PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) { 1215076ba34aSJunchao Zhang PetscFunctionBegin; 12169566063dSJacob Faibussowitsch PetscCall(MatCreate(comm, A)); 12179566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 12188c3ff71bSJunchao Zhang PetscFunctionReturn(0); 12198c3ff71bSJunchao Zhang } 12208c3ff71bSJunchao Zhang 12218c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/ 1222152b3e56SJunchao Zhang /*@C 12238c3ff71bSJunchao Zhang MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 12248c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 12258c3ff71bSJunchao Zhang Kokkos for calculations. For good matrix 12268c3ff71bSJunchao Zhang assembly performance the user should preallocate the matrix storage by setting 12278c3ff71bSJunchao Zhang the parameter nz (or the array nnz). By setting these parameters accurately, 12288c3ff71bSJunchao Zhang performance during matrix assembly can be increased by more than a factor of 50. 12298c3ff71bSJunchao Zhang 12308c3ff71bSJunchao Zhang Collective 12318c3ff71bSJunchao Zhang 12328c3ff71bSJunchao Zhang Input Parameters: 12338c3ff71bSJunchao Zhang + comm - MPI communicator, set to PETSC_COMM_SELF 12348c3ff71bSJunchao Zhang . m - number of rows 12358c3ff71bSJunchao Zhang . n - number of columns 12368c3ff71bSJunchao Zhang . nz - number of nonzeros per row (same for all rows) 12378c3ff71bSJunchao Zhang - nnz - array containing the number of nonzeros in the various rows 12388c3ff71bSJunchao Zhang (possibly different for each row) or NULL 12398c3ff71bSJunchao Zhang 12408c3ff71bSJunchao Zhang Output Parameter: 12418c3ff71bSJunchao Zhang . A - the matrix 12428c3ff71bSJunchao Zhang 12438c3ff71bSJunchao Zhang It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 12448c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 12458c3ff71bSJunchao Zhang [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 12468c3ff71bSJunchao Zhang 12478c3ff71bSJunchao Zhang Notes: 12488c3ff71bSJunchao Zhang If nnz is given then nz is ignored 12498c3ff71bSJunchao Zhang 12508c3ff71bSJunchao Zhang The AIJ format (also called the Yale sparse matrix format or 12518c3ff71bSJunchao Zhang compressed row storage), is fully compatible with standard Fortran 77 12528c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 12538c3ff71bSJunchao Zhang either one (as in Fortran) or zero. See the users' manual for details. 12548c3ff71bSJunchao Zhang 12558c3ff71bSJunchao Zhang Specify the preallocated storage with either nz or nnz (not both). 12568c3ff71bSJunchao Zhang Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 12578c3ff71bSJunchao Zhang allocation. For large problems you MUST preallocate memory or you 12588c3ff71bSJunchao Zhang will get TERRIBLE performance, see the users' manual chapter on matrices. 12598c3ff71bSJunchao Zhang 12608c3ff71bSJunchao Zhang By default, this format uses inodes (identical nodes) when possible, to 12618c3ff71bSJunchao Zhang improve numerical efficiency of matrix-vector products and solves. We 12628c3ff71bSJunchao Zhang search for consecutive rows with the same nonzero structure, thereby 12638c3ff71bSJunchao Zhang reusing matrix information to achieve increased efficiency. 12648c3ff71bSJunchao Zhang 12658c3ff71bSJunchao Zhang Level: intermediate 12668c3ff71bSJunchao Zhang 1267db781477SPatrick Sanan .seealso: `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()` 12688c3ff71bSJunchao Zhang @*/ 1269*9371c9d4SSatish Balay PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) { 12708c3ff71bSJunchao Zhang PetscFunctionBegin; 12719566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 12729566063dSJacob Faibussowitsch PetscCall(MatCreate(comm, A)); 12739566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A, m, n, m, n)); 12749566063dSJacob Faibussowitsch PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 12759566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 12768c3ff71bSJunchao Zhang PetscFunctionReturn(0); 12778c3ff71bSJunchao Zhang } 1278930e68a5SMark Adams 12798f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member; 12808f7e8f9dSMark Adams // 128146804e07SMark Adams // This factorization exploits block diagonal matrices with "Nf" (not used). 12828f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 12838f7e8f9dSMark Adams // 1284*9371c9d4SSatish Balay static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B, Mat A, const MatFactorInfo *info) { 12858f7e8f9dSMark Adams Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 12868f7e8f9dSMark Adams Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 12878f7e8f9dSMark Adams IS isrow = b->row, isicol = b->icol; 12888f7e8f9dSMark Adams const PetscInt *r_h, *ic_h; 1289300d22a6SJunchao Zhang const PetscInt n = A->rmap->n, *ai_d = aijkok->i_dual.view_device().data(), *aj_d = aijkok->j_dual.view_device().data(), *bi_d = baijkok->i_dual.view_device().data(), *bj_d = baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1290076ba34aSJunchao Zhang const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1291076ba34aSJunchao Zhang PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 12928f7e8f9dSMark Adams PetscBool row_identity, col_identity; 129346804e07SMark Adams PetscInt nc, Nf = 1, nVec = 32; // should be a parameter, Nf is batch size - not used 1294930e68a5SMark Adams 1295930e68a5SMark Adams PetscFunctionBegin; 12962c71b3e2SJacob Faibussowitsch PetscCheck(A->rmap->n == n, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT, A->rmap->n, n); 1297b94d7dedSBarry Smith PetscCall(MatIsStructurallySymmetric(A, &row_identity)); 12982c71b3e2SJacob Faibussowitsch PetscCheck(row_identity, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "structurally symmetric matrices only supported"); 12999566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &r_h)); 13009566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isicol, &ic_h)); 13019566063dSJacob Faibussowitsch PetscCall(ISGetSize(isicol, &nc)); 13029566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 13039566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 13048f7e8f9dSMark Adams { 13058f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1 13068f7e8f9dSMark Adams using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 13078f7e8f9dSMark Adams using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 13088f7e8f9dSMark Adams using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 13098f7e8f9dSMark Adams const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_r_k(r_h, n); 13108f7e8f9dSMark Adams Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_r_k("r", n); 13118f7e8f9dSMark Adams const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_ic_k(ic_h, nc); 13128f7e8f9dSMark Adams Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_ic_k("ic", nc); 13138f7e8f9dSMark Adams size_t flops_h = 0.0; 13148f7e8f9dSMark Adams Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k(&flops_h); 13158f7e8f9dSMark Adams Kokkos::View<size_t> d_flops_k("flops"); 13168f7e8f9dSMark Adams const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 13178f7e8f9dSMark Adams const int nloc = n / Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 13188f7e8f9dSMark Adams Kokkos::deep_copy(d_flops_k, h_flops_k); 13198f7e8f9dSMark Adams Kokkos::deep_copy(d_r_k, h_r_k); 13208f7e8f9dSMark Adams Kokkos::deep_copy(d_ic_k, h_ic_k); 13218f7e8f9dSMark Adams // Fill A --> fact 1322*9371c9d4SSatish Balay Kokkos::parallel_for( 1323*9371c9d4SSatish Balay Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec), KOKKOS_LAMBDA(const team_member team) { 1324042217e8SBarry Smith const PetscInt field = team.league_rank() / Ni, field_block = team.league_rank() % Ni; // use grid.x/y in CUDA 13258f7e8f9dSMark Adams const PetscInt nloc_i = (nloc / Ni + !!(nloc % Ni)), start_i = field * nloc + field_block * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i); 13268f7e8f9dSMark Adams const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 13278f7e8f9dSMark Adams // zero rows of B 13288f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) { 13298f7e8f9dSMark Adams PetscInt nzbL = bi_d[rowb + 1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb + 1]; // with diag 13308f7e8f9dSMark Adams PetscScalar *baL = ba_d + bi_d[rowb]; 13318f7e8f9dSMark Adams PetscScalar *baU = ba_d + bdiag_d[rowb + 1] + 1; 13328f7e8f9dSMark Adams /* zero (unfactored row) */ 13338f7e8f9dSMark Adams for (int j = 0; j < nzbL; j++) baL[j] = 0; 13348f7e8f9dSMark Adams for (int j = 0; j < nzbU; j++) baU[j] = 0; 13358f7e8f9dSMark Adams }); 13368f7e8f9dSMark Adams // copy A into B 13378f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) { 13388f7e8f9dSMark Adams PetscInt rowa = r[rowb], nza = ai_d[rowa + 1] - ai_d[rowa]; 13398f7e8f9dSMark Adams const PetscScalar *av = aa_d + ai_d[rowa]; 13408f7e8f9dSMark Adams const PetscInt *ajtmp = aj_d + ai_d[rowa]; 13418f7e8f9dSMark Adams /* load in initial (unfactored row) */ 13428f7e8f9dSMark Adams for (int j = 0; j < nza; j++) { 13438f7e8f9dSMark Adams PetscInt colb = ic[ajtmp[j]]; 13448f7e8f9dSMark Adams PetscScalar vala = av[j]; 13458f7e8f9dSMark Adams if (colb == rowb) { 13468f7e8f9dSMark Adams *(ba_d + bdiag_d[rowb]) = vala; 13478f7e8f9dSMark Adams } else { 13488f7e8f9dSMark Adams const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]); 13498f7e8f9dSMark Adams PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]); 13508f7e8f9dSMark Adams PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb + 1] + 1) : bi_d[rowb + 1] - bi_d[rowb], set = 0; 13518f7e8f9dSMark Adams for (int j = 0; j < nz; j++) { 13528f7e8f9dSMark Adams if (pbj[j] == colb) { 13538f7e8f9dSMark Adams pba[j] = vala; 13548f7e8f9dSMark Adams set++; 13558f7e8f9dSMark Adams break; 13568f7e8f9dSMark Adams } 13578f7e8f9dSMark Adams } 13588f1da0b2SJunchao Zhang #if !defined(PETSC_HAVE_SYCL) 13598f7e8f9dSMark Adams if (set != 1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 13608f1da0b2SJunchao Zhang #endif 13618f7e8f9dSMark Adams } 13628f7e8f9dSMark Adams } 13638f7e8f9dSMark Adams }); 13648f7e8f9dSMark Adams }); 13658f7e8f9dSMark Adams Kokkos::fence(); 1366930e68a5SMark Adams 1367*9371c9d4SSatish Balay Kokkos::parallel_for( 1368*9371c9d4SSatish Balay Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size() + scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA(const team_member team) { 13698f7e8f9dSMark Adams sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 13708f7e8f9dSMark Adams scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 13718f7e8f9dSMark Adams sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1372042217e8SBarry Smith const PetscInt field = team.league_rank() / Ni, field_block_idx = team.league_rank() % Ni; // use grid.x/y in CUDA 13738f7e8f9dSMark Adams const PetscInt start = field * nloc, end = start + nloc; 13748f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 13758f7e8f9dSMark Adams // A22 panel update for each row A(1,:) and col A(:,1) 13768f7e8f9dSMark Adams for (int ii = start; ii < end - 1; ii++) { 13778f7e8f9dSMark Adams const PetscInt *bjUi = bj_d + bdiag_d[ii + 1] + 1, nzUi = bdiag_d[ii] - (bdiag_d[ii + 1] + 1); // vector, and vector size, of column indices of U(i,(i+1):end) 13788f7e8f9dSMark Adams const PetscScalar *baUi = ba_d + bdiag_d[ii + 1] + 1; // vector of data U(i,i+1:end) 13798f7e8f9dSMark Adams const PetscInt nUi_its = nzUi / Ni + !!(nzUi % Ni); 13808f7e8f9dSMark Adams const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 13818f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=](const int j) { 13828f7e8f9dSMark Adams PetscInt kIdx = j * Ni + field_block_idx; 1383*9371c9d4SSatish Balay if (kIdx >= nzUi) /* void */ 1384*9371c9d4SSatish Balay ; 13858f7e8f9dSMark Adams else { 13868f7e8f9dSMark Adams const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 13878f7e8f9dSMark Adams const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 13888f7e8f9dSMark Adams const PetscInt nzL = bi_d[myk + 1] - bi_d[myk]; // size of L_k(:) 13898f7e8f9dSMark Adams size_t st_idx; 13908f7e8f9dSMark Adams // find and do L(k,i) = A(:k,i) / A(i,i) 13918f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 13928f7e8f9dSMark Adams // get column, there has got to be a better way 1393*9371c9d4SSatish Balay Kokkos::parallel_reduce( 1394*9371c9d4SSatish Balay Kokkos::ThreadVectorRange(team, nzL), 1395*9371c9d4SSatish Balay [&](const int &j, size_t &idx) { 13968f7e8f9dSMark Adams if (pjL[j] == ii) { 13978f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + j; 13988f7e8f9dSMark Adams idx = j; // output 13998f7e8f9dSMark Adams *pLki = *pLki / Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 14008f7e8f9dSMark Adams } 1401*9371c9d4SSatish Balay }, 1402*9371c9d4SSatish Balay st_idx); 1403*9371c9d4SSatish Balay Kokkos::single(Kokkos::PerThread(team), [=]() { 1404*9371c9d4SSatish Balay colkIdx() = st_idx; 1405*9371c9d4SSatish Balay L_ki() = *(ba_d + bi_d[myk] + st_idx); 1406*9371c9d4SSatish Balay }); 14078f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 140899551766SMark Adams if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n", (int)myk, ii); // uses a register 140999551766SMark Adams #endif 141099551766SMark Adams // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 14118f7e8f9dSMark Adams // U(i+1,:end) 14128f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, nzUi), [=](const int &uiIdx) { // index into i (U) 14138f7e8f9dSMark Adams PetscScalar Uij = baUi[uiIdx]; 14148f7e8f9dSMark Adams PetscInt col = bjUi[uiIdx]; 14158f7e8f9dSMark Adams if (col == myk) { 14168f7e8f9dSMark Adams // A_kk = A_kk - L_ki * U_ij(k) 14178f7e8f9dSMark Adams PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 14188f7e8f9dSMark Adams *Akkv = *Akkv - L_ki() * Uij; // UiK 14198f7e8f9dSMark Adams } else { 14208f7e8f9dSMark Adams PetscScalar *start, *end, *pAkjv = NULL; 14218f7e8f9dSMark Adams PetscInt high, low; 14228f7e8f9dSMark Adams const PetscInt *startj; 14238f7e8f9dSMark Adams if (col < myk) { // L 14248f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 14258f7e8f9dSMark Adams PetscInt idx = (pLki + 1) - (ba_d + bi_d[myk]); // index into row 14268f7e8f9dSMark Adams start = pLki + 1; // start at pLki+1, A22(myk,1) 14278f7e8f9dSMark Adams startj = bj_d + bi_d[myk] + idx; 14288f7e8f9dSMark Adams end = ba_d + bi_d[myk + 1]; 14298f7e8f9dSMark Adams } else { 14308f7e8f9dSMark Adams PetscInt idx = bdiag_d[myk + 1] + 1; 14318f7e8f9dSMark Adams start = ba_d + idx; 14328f7e8f9dSMark Adams startj = bj_d + idx; 14338f7e8f9dSMark Adams end = ba_d + bdiag_d[myk]; 14348f7e8f9dSMark Adams } 14358f7e8f9dSMark Adams // search for 'col', use bisection search - TODO 14368f7e8f9dSMark Adams low = 0; 14378f7e8f9dSMark Adams high = (PetscInt)(end - start); 14388f7e8f9dSMark Adams while (high - low > 5) { 14398f7e8f9dSMark Adams int t = (low + high) / 2; 14408f7e8f9dSMark Adams if (startj[t] > col) high = t; 14418f7e8f9dSMark Adams else low = t; 14428f7e8f9dSMark Adams } 14438f7e8f9dSMark Adams for (pAkjv = start + low; pAkjv < start + high; pAkjv++) { 14448f7e8f9dSMark Adams if (startj[pAkjv - start] == col) break; 14458f7e8f9dSMark Adams } 14468f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 144799551766SMark Adams if (pAkjv == start + high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n", (int)myk, (int)col); // uses a register 144899551766SMark Adams #endif 14498f7e8f9dSMark Adams *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 14508f7e8f9dSMark Adams } 14518f7e8f9dSMark Adams }); 14528f7e8f9dSMark Adams } 14538f7e8f9dSMark Adams }); 14548f7e8f9dSMark Adams team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 14558f7e8f9dSMark Adams if (field_block_idx == 0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add(flops.data(), (size_t)(2 * (nzUi * nzUi) + 2)); }); 14568f7e8f9dSMark Adams } /* endof for (i=0; i<n; i++) { */ 1457*9371c9d4SSatish Balay Kokkos::single(Kokkos::PerTeam(team), [=]() { 1458*9371c9d4SSatish Balay Kokkos::atomic_add(&d_flops_k(), flops()); 1459*9371c9d4SSatish Balay flops() = 0; 1460*9371c9d4SSatish Balay }); 14618f7e8f9dSMark Adams }); 14628f7e8f9dSMark Adams Kokkos::fence(); 14638f7e8f9dSMark Adams Kokkos::deep_copy(h_flops_k, d_flops_k); 14649566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k())); 1465*9371c9d4SSatish Balay Kokkos::parallel_for( 1466*9371c9d4SSatish Balay Kokkos::TeamPolicy<>(Nf * Ni, 1, 256), KOKKOS_LAMBDA(const team_member team) { 14678f7e8f9dSMark Adams const PetscInt lg_rank = team.league_rank(), field = lg_rank / Ni; //, field_offset = lg_rank%Ni; 14688f7e8f9dSMark Adams const PetscInt start = field * nloc, end = start + nloc, n_its = (nloc / Ni + !!(nloc % Ni)); // 1/Ni iters 14698f7e8f9dSMark Adams /* Invert diagonal for simpler triangular solves */ 14708f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=](int outer_index) { 14718f7e8f9dSMark Adams int i = start + outer_index * Ni + lg_rank % Ni; 14728f7e8f9dSMark Adams if (i < end) { 14738f7e8f9dSMark Adams PetscScalar *pv = ba_d + bdiag_d[i]; 14748f7e8f9dSMark Adams *pv = 1.0 / (*pv); 14758f7e8f9dSMark Adams } 14768f7e8f9dSMark Adams }); 14778f7e8f9dSMark Adams }); 14788f7e8f9dSMark Adams } 14799566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 14809566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isicol, &ic_h)); 14819566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &r_h)); 14828f7e8f9dSMark Adams 14839566063dSJacob Faibussowitsch PetscCall(ISIdentity(isrow, &row_identity)); 14849566063dSJacob Faibussowitsch PetscCall(ISIdentity(isicol, &col_identity)); 14858f7e8f9dSMark Adams if (b->inode.size) { 14868f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_Inode; 14878f7e8f9dSMark Adams } else if (row_identity && col_identity) { 14888f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 14898f7e8f9dSMark Adams } else { 14908f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 14918f7e8f9dSMark Adams } 14928f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_GPU; 14939566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU 14948f7e8f9dSMark Adams B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 14958f7e8f9dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 14968f7e8f9dSMark Adams B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 14978f7e8f9dSMark Adams B->ops->matsolve = MatMatSolve_SeqAIJ; 14988f7e8f9dSMark Adams B->assembled = PETSC_TRUE; 14998f7e8f9dSMark Adams B->preallocated = PETSC_TRUE; 15008f7e8f9dSMark Adams 1501930e68a5SMark Adams PetscFunctionReturn(0); 1502930e68a5SMark Adams } 1503930e68a5SMark Adams 1504*9371c9d4SSatish Balay static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) { 1505930e68a5SMark Adams PetscFunctionBegin; 15069566063dSJacob Faibussowitsch PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 150786a27549SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 150886a27549SJunchao Zhang PetscFunctionReturn(0); 150986a27549SJunchao Zhang } 151086a27549SJunchao Zhang 1511*9371c9d4SSatish Balay static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) { 151286a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 151386a27549SJunchao Zhang 151486a27549SJunchao Zhang PetscFunctionBegin; 151586a27549SJunchao Zhang if (!factors->sptrsv_symbolic_completed) { 151686a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 151786a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 151886a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_TRUE; 151986a27549SJunchao Zhang } 152086a27549SJunchao Zhang PetscFunctionReturn(0); 152186a27549SJunchao Zhang } 152286a27549SJunchao Zhang 152386a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */ 1524*9371c9d4SSatish Balay static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) { 152586a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1526076ba34aSJunchao Zhang MatColIdxType n = A->rmap->n; 152786a27549SJunchao Zhang 152886a27549SJunchao Zhang PetscFunctionBegin; 152986a27549SJunchao Zhang if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 153086a27549SJunchao Zhang /* Update L^T and do sptrsv symbolic */ 1531076ba34aSJunchao Zhang factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); 153286a27549SJunchao Zhang Kokkos::deep_copy(factors->iLt_d, 0); /* KK requires 0 */ 1533076ba34aSJunchao Zhang factors->jLt_d = MatColIdxKokkosView("factors->jLt_d", factors->jL_d.extent(0)); 1534076ba34aSJunchao Zhang factors->aLt_d = MatScalarKokkosView("factors->aLt_d", factors->aL_d.extent(0)); 153586a27549SJunchao Zhang 1536*9371c9d4SSatish Balay transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 153786a27549SJunchao Zhang factors->iLt_d, factors->jLt_d, factors->aLt_d); 153886a27549SJunchao Zhang 153986a27549SJunchao Zhang /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 154086a27549SJunchao Zhang We have to sort the indices, until KK provides finer control options. 154186a27549SJunchao Zhang */ 1542*9371c9d4SSatish Balay sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 154386a27549SJunchao Zhang 154486a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 154586a27549SJunchao Zhang 154686a27549SJunchao Zhang /* Update U^T and do sptrsv symbolic */ 1547076ba34aSJunchao Zhang factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); 154886a27549SJunchao Zhang Kokkos::deep_copy(factors->iUt_d, 0); /* KK requires 0 */ 1549076ba34aSJunchao Zhang factors->jUt_d = MatColIdxKokkosView("factors->jUt_d", factors->jU_d.extent(0)); 1550076ba34aSJunchao Zhang factors->aUt_d = MatScalarKokkosView("factors->aUt_d", factors->aU_d.extent(0)); 155186a27549SJunchao Zhang 1552*9371c9d4SSatish Balay transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 155386a27549SJunchao Zhang factors->iUt_d, factors->jUt_d, factors->aUt_d); 155486a27549SJunchao Zhang 155586a27549SJunchao Zhang /* Sort indices. See comments above */ 1556*9371c9d4SSatish Balay sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 155786a27549SJunchao Zhang 155886a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 155986a27549SJunchao Zhang factors->transpose_updated = PETSC_TRUE; 156086a27549SJunchao Zhang } 156186a27549SJunchao Zhang PetscFunctionReturn(0); 156286a27549SJunchao Zhang } 156386a27549SJunchao Zhang 156486a27549SJunchao Zhang /* Solve Ax = b, with A = LU */ 1565*9371c9d4SSatish Balay static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) { 156686a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 156786a27549SJunchao Zhang PetscScalarKokkosView xv; 156886a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 156986a27549SJunchao Zhang 157086a27549SJunchao Zhang PetscFunctionBegin; 15719566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 15729566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 15739566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(b, &bv)); 15749566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(x, &xv)); 157586a27549SJunchao Zhang /* Solve L tmpv = b */ 15769566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 157786a27549SJunchao Zhang /* Solve Ux = tmpv */ 15789566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 15799566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(b, &bv)); 15809566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 15819566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 158286a27549SJunchao Zhang PetscFunctionReturn(0); 158386a27549SJunchao Zhang } 158486a27549SJunchao Zhang 1585076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */ 1586*9371c9d4SSatish Balay static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) { 158786a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 158886a27549SJunchao Zhang PetscScalarKokkosView xv; 158986a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 159086a27549SJunchao Zhang 159186a27549SJunchao Zhang PetscFunctionBegin; 15929566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 15939566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 15949566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(b, &bv)); 15959566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(x, &xv)); 159686a27549SJunchao Zhang /* Solve U^T tmpv = b */ 159786a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 159886a27549SJunchao Zhang 159986a27549SJunchao Zhang /* Solve L^T x = tmpv */ 160086a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 16019566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(b, &bv)); 16029566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 16039566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 160486a27549SJunchao Zhang PetscFunctionReturn(0); 160586a27549SJunchao Zhang } 160686a27549SJunchao Zhang 1607*9371c9d4SSatish Balay static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) { 160886a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 160986a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 161086a27549SJunchao Zhang PetscInt fill_lev = info->levels; 161186a27549SJunchao Zhang 161286a27549SJunchao Zhang PetscFunctionBegin; 16139566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 16149566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1615076ba34aSJunchao Zhang 1616076ba34aSJunchao Zhang auto a_d = aijkok->a_dual.view_device(); 1617076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1618076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1619076ba34aSJunchao Zhang 1620076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 162186a27549SJunchao Zhang 162286a27549SJunchao Zhang B->assembled = PETSC_TRUE; 162386a27549SJunchao Zhang B->preallocated = PETSC_TRUE; 162486a27549SJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos; 162586a27549SJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 162686a27549SJunchao Zhang B->ops->matsolve = NULL; 162786a27549SJunchao Zhang B->ops->matsolvetranspose = NULL; 162886a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 162986a27549SJunchao Zhang 163086a27549SJunchao Zhang /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 163186a27549SJunchao Zhang factors->transpose_updated = PETSC_FALSE; 163286a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 1633eeadb341SJunchao Zhang /* TODO: log flops, but how to know that? */ 16349566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 163586a27549SJunchao Zhang PetscFunctionReturn(0); 163686a27549SJunchao Zhang } 163786a27549SJunchao Zhang 1638*9371c9d4SSatish Balay static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) { 163986a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 164086a27549SJunchao Zhang Mat_SeqAIJ *b; 164186a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 164286a27549SJunchao Zhang PetscInt fill_lev = info->levels; 164386a27549SJunchao Zhang PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 164486a27549SJunchao Zhang PetscInt n = A->rmap->n; 164586a27549SJunchao Zhang 164686a27549SJunchao Zhang PetscFunctionBegin; 16479566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 164886a27549SJunchao Zhang /* Rebuild factors */ 1649*9371c9d4SSatish Balay if (factors) { 1650*9371c9d4SSatish Balay factors->Destroy(); 1651*9371c9d4SSatish Balay } /* Destroy the old if it exists */ 1652*9371c9d4SSatish Balay else { 1653*9371c9d4SSatish Balay B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 1654*9371c9d4SSatish Balay } 165586a27549SJunchao Zhang 165686a27549SJunchao Zhang /* Create a spiluk handle and then do symbolic factorization */ 165786a27549SJunchao Zhang nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 165886a27549SJunchao Zhang factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 165986a27549SJunchao Zhang 166086a27549SJunchao Zhang auto spiluk_handle = factors->kh.get_spiluk_handle(); 166186a27549SJunchao Zhang 166286a27549SJunchao Zhang Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 166386a27549SJunchao Zhang Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 166486a27549SJunchao Zhang Kokkos::realloc(factors->iU_d, n + 1); 166586a27549SJunchao Zhang Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 166686a27549SJunchao Zhang 166786a27549SJunchao Zhang aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1668076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1669076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1670076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 167186a27549SJunchao Zhang /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 167286a27549SJunchao Zhang 167386a27549SJunchao Zhang Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 167486a27549SJunchao Zhang Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 167586a27549SJunchao Zhang Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 167686a27549SJunchao Zhang Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 167786a27549SJunchao Zhang 167886a27549SJunchao Zhang /* TODO: add options to select sptrsv algorithms */ 167986a27549SJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 168086a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 168186a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 168286a27549SJunchao Zhang #else 168386a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 168486a27549SJunchao Zhang #endif 168586a27549SJunchao Zhang 168686a27549SJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 168786a27549SJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 168886a27549SJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 168986a27549SJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 169086a27549SJunchao Zhang 169186a27549SJunchao Zhang /* Fill fields of the factor matrix B */ 16929566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 169386a27549SJunchao Zhang b = (Mat_SeqAIJ *)B->data; 169486a27549SJunchao Zhang b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 169586a27549SJunchao Zhang B->info.fill_ratio_given = info->fill; 169686a27549SJunchao Zhang B->info.fill_ratio_needed = ((PetscReal)b->nz) / ((PetscReal)nnzA); 169786a27549SJunchao Zhang 169886a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 169986a27549SJunchao Zhang B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1700930e68a5SMark Adams PetscFunctionReturn(0); 1701930e68a5SMark Adams } 1702930e68a5SMark Adams 1703*9371c9d4SSatish Balay static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) { 17048f7e8f9dSMark Adams Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 17058f7e8f9dSMark Adams const PetscInt nrows = A->rmap->n; 1706930e68a5SMark Adams 17078f7e8f9dSMark Adams PetscFunctionBegin; 17089566063dSJacob Faibussowitsch PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 17098f7e8f9dSMark Adams B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 17108f7e8f9dSMark Adams // move B data into Kokkos 17119566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok 17129566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok 17138f7e8f9dSMark Adams { 17148f7e8f9dSMark Adams Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 1715300d22a6SJunchao Zhang if (!baijkok->diag_d.extent(0)) { 17168f7e8f9dSMark Adams const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_diag(b->diag, nrows + 1); 1717300d22a6SJunchao Zhang baijkok->diag_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_diag)); 1718300d22a6SJunchao Zhang Kokkos::deep_copy(baijkok->diag_d, h_diag); 17198f7e8f9dSMark Adams } 17208f7e8f9dSMark Adams } 17218f7e8f9dSMark Adams PetscFunctionReturn(0); 17228f7e8f9dSMark Adams } 17238f7e8f9dSMark Adams 1724*9371c9d4SSatish Balay static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) { 1725930e68a5SMark Adams PetscFunctionBegin; 1726930e68a5SMark Adams *type = MATSOLVERKOKKOS; 1727930e68a5SMark Adams PetscFunctionReturn(0); 1728930e68a5SMark Adams } 1729930e68a5SMark Adams 1730*9371c9d4SSatish Balay static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A, MatSolverType *type) { 17318f7e8f9dSMark Adams PetscFunctionBegin; 17328f7e8f9dSMark Adams *type = MATSOLVERKOKKOSDEVICE; 17338f7e8f9dSMark Adams PetscFunctionReturn(0); 17348f7e8f9dSMark Adams } 17358f7e8f9dSMark Adams 1736930e68a5SMark Adams /*MC 173786a27549SJunchao Zhang MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 173886a27549SJunchao Zhang on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1739930e68a5SMark Adams 1740930e68a5SMark Adams Level: beginner 1741930e68a5SMark Adams 1742db781477SPatrick Sanan .seealso: `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1743930e68a5SMark Adams M*/ 174486a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1745930e68a5SMark Adams { 1746930e68a5SMark Adams PetscInt n = A->rmap->n; 1747930e68a5SMark Adams 1748930e68a5SMark Adams PetscFunctionBegin; 17499566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 17509566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*B, n, n, n, n)); 1751930e68a5SMark Adams (*B)->factortype = ftype; 17529566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 17539566063dSJacob Faibussowitsch PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1754930e68a5SMark Adams 17558f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 17569566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 175786a27549SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 175886a27549SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 175986a27549SJunchao Zhang } else if (ftype == MAT_FACTOR_ILU) { 17609566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 176186a27549SJunchao Zhang (*B)->canuseordering = PETSC_FALSE; 176286a27549SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 176398921bdaSJacob Faibussowitsch } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1764930e68a5SMark Adams 17659566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 17669566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 1767930e68a5SMark Adams PetscFunctionReturn(0); 1768930e68a5SMark Adams } 17698f7e8f9dSMark Adams 1770*9371c9d4SSatish Balay PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A, MatFactorType ftype, Mat *B) { 17718f7e8f9dSMark Adams PetscInt n = A->rmap->n; 17728f7e8f9dSMark Adams 17738f7e8f9dSMark Adams PetscFunctionBegin; 17749566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 17759566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*B, n, n, n, n)); 17768f7e8f9dSMark Adams (*B)->factortype = ftype; 1777f73b0415SBarry Smith (*B)->canuseordering = PETSC_TRUE; 17789566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 17799566063dSJacob Faibussowitsch PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 17808f7e8f9dSMark Adams 17818f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 17829566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 17838f7e8f9dSMark Adams (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 17848f7e8f9dSMark Adams } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for KOKKOS Matrix Types"); 17858f7e8f9dSMark Adams 17869566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 17879566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_kokkos_device)); 17888f7e8f9dSMark Adams PetscFunctionReturn(0); 17898f7e8f9dSMark Adams } 179086a27549SJunchao Zhang 1791*9371c9d4SSatish Balay PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) { 179286a27549SJunchao Zhang PetscFunctionBegin; 17939566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 17949566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 17959566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_seqaijkokkos_kokkos_device)); 179686a27549SJunchao Zhang PetscFunctionReturn(0); 179786a27549SJunchao Zhang } 179886a27549SJunchao Zhang 1799076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */ 1800*9371c9d4SSatish Balay PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) { 1801076ba34aSJunchao Zhang const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map); 1802076ba34aSJunchao Zhang const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries); 1803076ba34aSJunchao Zhang const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values); 1804076ba34aSJunchao Zhang const PetscInt *i = iv.data(); 1805076ba34aSJunchao Zhang const PetscInt *j = jv.data(); 1806076ba34aSJunchao Zhang const PetscScalar *a = av.data(); 1807076ba34aSJunchao Zhang PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 1808076ba34aSJunchao Zhang 1809076ba34aSJunchao Zhang PetscFunctionBegin; 18109566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 1811076ba34aSJunchao Zhang for (PetscInt k = 0; k < m; k++) { 18129566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 1813*9371c9d4SSatish Balay for (PetscInt p = i[k]; p < i[k + 1]; p++) { PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); } 18149566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 1815076ba34aSJunchao Zhang } 1816076ba34aSJunchao Zhang PetscFunctionReturn(0); 1817076ba34aSJunchao Zhang } 1818