1e36ced11SJunchao Zhang #include <petsc_kokkos.hpp> 211d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 3c0c276a7Ssdargavi #include <petscmat_kokkos.hpp> 4076ba34aSJunchao Zhang #include <petscpkg_version.h> 5152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 642550becSJunchao Zhang #include <petsc/private/sfimpl.h> 7aac854edSJunchao Zhang #include <petsc/private/kokkosimpl.hpp> 87233ce55SJed Brown #include <petscsys.h> 98c3ff71bSJunchao Zhang 108c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 11f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 128c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 13cc6e31f1SJunchao Zhang 14cc6e31f1SJunchao Zhang // To suppress compiler warnings: 15cc6e31f1SJunchao Zhang // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63: 16cc6e31f1SJunchao Zhang // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, 17cc6e31f1SJunchao Zhang // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*, 18cc6e31f1SJunchao Zhang // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations] 19cc6e31f1SJunchao Zhang PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations") 208c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 21cc6e31f1SJunchao Zhang PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END() 22cc6e31f1SJunchao Zhang 2386a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp> 2486a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp> 25076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp> 26076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp> 279d13fa56SJunchao Zhang #include <KokkosBatched_LU_Decl.hpp> 289d13fa56SJunchao Zhang #include <KokkosBatched_InverseLU_Decl.hpp> 2986a27549SJunchao Zhang 3042550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 318c3ff71bSJunchao Zhang 320e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 33f98996d3SJunchao Zhang #include <KokkosSparse_Utils.hpp> 34f98996d3SJunchao Zhang using KokkosSparse::sort_crs_matrix; 359371c9d4SSatish Balay using KokkosSparse::Impl::transpose_matrix; 36f98996d3SJunchao Zhang #else 37f98996d3SJunchao Zhang #include <KokkosKernels_Sorting.hpp> 38f98996d3SJunchao Zhang using KokkosKernels::sort_crs_matrix; 399371c9d4SSatish Balay using KokkosKernels::Impl::transpose_matrix; 40f98996d3SJunchao Zhang #endif 41f98996d3SJunchao Zhang 42aac854edSJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0) 43aac854edSJunchao Zhang using KokkosSparse::spiluk_symbolic; 44aac854edSJunchao Zhang using KokkosSparse::spiluk_numeric; 45aac854edSJunchao Zhang using KokkosSparse::sptrsv_symbolic; 46aac854edSJunchao Zhang using KokkosSparse::sptrsv_solve; 47aac854edSJunchao Zhang using KokkosSparse::Experimental::SPTRSVAlgorithm; 48aac854edSJunchao Zhang using KokkosSparse::Experimental::SPILUKAlgorithm; 49aac854edSJunchao Zhang #else 50aac854edSJunchao Zhang using KokkosSparse::Experimental::spiluk_symbolic; 51aac854edSJunchao Zhang using KokkosSparse::Experimental::spiluk_numeric; 52aac854edSJunchao Zhang using KokkosSparse::Experimental::sptrsv_symbolic; 53aac854edSJunchao Zhang using KokkosSparse::Experimental::sptrsv_solve; 54aac854edSJunchao Zhang using KokkosSparse::Experimental::SPTRSVAlgorithm; 55aac854edSJunchao Zhang using KokkosSparse::Experimental::SPILUKAlgorithm; 56aac854edSJunchao Zhang #endif 57aac854edSJunchao Zhang 588c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 598c3ff71bSJunchao Zhang 60076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 61076ba34aSJunchao Zhang we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 62076ba34aSJunchao Zhang In the latter case, it is important to set a_dual's sync state correctly. 63076ba34aSJunchao Zhang */ 64d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 65d71ae5a4SJacob Faibussowitsch { 66076ba34aSJunchao Zhang Mat_SeqAIJ *aijseq; 67076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 688c3ff71bSJunchao Zhang 698c3ff71bSJunchao Zhang PetscFunctionBegin; 703ba16761SJacob Faibussowitsch if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 719566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 72076ba34aSJunchao Zhang 73076ba34aSJunchao Zhang aijseq = static_cast<Mat_SeqAIJ *>(A->data); 74076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 75076ba34aSJunchao Zhang 76076ba34aSJunchao Zhang /* If aijkok does not exist, we just copy i, j to device. 77076ba34aSJunchao Zhang If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 78076ba34aSJunchao Zhang In both cases, we build a new aijkok structure. 79076ba34aSJunchao Zhang */ 80076ba34aSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 8193a54799SPierre Jolivet if (aijkok && aijkok->host_aij_allocated_by_kokkos) { /* Avoid accidentally freeing much needed a,i,j on host when deleting aijkok */ 82d1c799ffSJunchao Zhang PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i)); 83d1c799ffSJunchao Zhang PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j)); 84d1c799ffSJunchao Zhang PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a)); 85d1c799ffSJunchao Zhang PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1)); 86d1c799ffSJunchao Zhang PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz())); 87d1c799ffSJunchao Zhang PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz())); 88d1c799ffSJunchao Zhang aijseq->free_a = PETSC_TRUE; 89d1c799ffSJunchao Zhang aijseq->free_ij = PETSC_TRUE; 90d1c799ffSJunchao Zhang /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where 91d1c799ffSJunchao Zhang we have the CsrMatrix on device first and then copy to host, followed by 92d1c799ffSJunchao Zhang MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL. 93d1c799ffSJunchao Zhang One could improve it by not using NULL garray. 94d1c799ffSJunchao Zhang */ 95d1c799ffSJunchao Zhang } 96076ba34aSJunchao Zhang delete aijkok; 97f4747e26SJunchao Zhang aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 98076ba34aSJunchao Zhang A->spptr = aijkok; 99f4747e26SJunchao Zhang } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 100f4747e26SJunchao Zhang MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 101f4747e26SJunchao Zhang auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 102f4747e26SJunchao Zhang aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 103076ba34aSJunchao Zhang } 1043ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1058c3ff71bSJunchao Zhang } 1068c3ff71bSJunchao Zhang 10786a27549SJunchao Zhang /* Sync CSR data to device if not yet */ 108d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 109d71ae5a4SJacob Faibussowitsch { 1108c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1118c3ff71bSJunchao Zhang 1128c3ff71bSJunchao Zhang PetscFunctionBegin; 113aaa8cc7dSPierre Jolivet PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 1145f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 115076ba34aSJunchao Zhang if (aijkok->a_dual.need_sync_device()) { 116f3d3cd90SZach Atkins PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 117580c7c76SPierre Jolivet aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 11886a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 1198c3ff71bSJunchao Zhang } 1203ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1218c3ff71bSJunchao Zhang } 1228c3ff71bSJunchao Zhang 123076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */ 124d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 125d71ae5a4SJacob Faibussowitsch { 12686a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 12786a27549SJunchao Zhang 12886a27549SJunchao Zhang PetscFunctionBegin; 1295f80ce2aSJacob Faibussowitsch PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 13086a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 13186a27549SJunchao Zhang aijkok->a_dual.modify_device(); 13286a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 13386a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 1349566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 1359566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 1363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 13786a27549SJunchao Zhang } 13886a27549SJunchao Zhang 139d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 140d71ae5a4SJacob Faibussowitsch { 141f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1424df4a32cSJunchao Zhang auto exec = PetscGetKokkosExecutionSpace(); 143f0cf5187SStefano Zampini 144f0cf5187SStefano Zampini PetscFunctionBegin; 145f0cf5187SStefano Zampini PetscCheckTypeName(A, MATSEQAIJKOKKOS); 14686a27549SJunchao Zhang /* We do not expect one needs factors on host */ 147aaa8cc7dSPierre Jolivet PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 1485f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 149f3d3cd90SZach Atkins PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec)); 1503ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 151f0cf5187SStefano Zampini } 152f0cf5187SStefano Zampini 153d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 154d71ae5a4SJacob Faibussowitsch { 155076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 156f0cf5187SStefano Zampini 157f0cf5187SStefano Zampini PetscFunctionBegin; 1585519a089SJose E. Roman /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 1595519a089SJose E. Roman Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 1605519a089SJose E. Roman reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 1615519a089SJose E. Roman must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 1625519a089SJose E. Roman */ 1635519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 164f3d3cd90SZach Atkins PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 165076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 166076ba34aSJunchao Zhang } else { /* Happens when calling MatSetValues on a newly created matrix */ 167076ba34aSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 168076ba34aSJunchao Zhang } 1693ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 170076ba34aSJunchao Zhang } 171076ba34aSJunchao Zhang 172d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 173d71ae5a4SJacob Faibussowitsch { 174076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 175076ba34aSJunchao Zhang 176076ba34aSJunchao Zhang PetscFunctionBegin; 1775519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 1783ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 179076ba34aSJunchao Zhang } 180076ba34aSJunchao Zhang 181d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 182d71ae5a4SJacob Faibussowitsch { 183076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 184076ba34aSJunchao Zhang 185076ba34aSJunchao Zhang PetscFunctionBegin; 1865519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 187f3d3cd90SZach Atkins PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 188076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1892328674fSJunchao Zhang } else { 1902328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 1912328674fSJunchao Zhang } 1923ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 193076ba34aSJunchao Zhang } 194076ba34aSJunchao Zhang 195d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 196d71ae5a4SJacob Faibussowitsch { 197076ba34aSJunchao Zhang PetscFunctionBegin; 198076ba34aSJunchao Zhang *array = NULL; 1993ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 200076ba34aSJunchao Zhang } 201076ba34aSJunchao Zhang 202d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 203d71ae5a4SJacob Faibussowitsch { 204076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 205076ba34aSJunchao Zhang 206076ba34aSJunchao Zhang PetscFunctionBegin; 2075519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 208076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 2092328674fSJunchao Zhang } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 2102328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 2112328674fSJunchao Zhang } 2123ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 213076ba34aSJunchao Zhang } 214076ba34aSJunchao Zhang 215d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 216d71ae5a4SJacob Faibussowitsch { 217076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 218076ba34aSJunchao Zhang 219076ba34aSJunchao Zhang PetscFunctionBegin; 2205519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 221076ba34aSJunchao Zhang aijkok->a_dual.clear_sync_state(); 222076ba34aSJunchao Zhang aijkok->a_dual.modify_host(); 2232328674fSJunchao Zhang } 2243ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 225f0cf5187SStefano Zampini } 226f0cf5187SStefano Zampini 227d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 228d71ae5a4SJacob Faibussowitsch { 2297ee59b9bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 2307ee59b9bSJunchao Zhang 2317ee59b9bSJunchao Zhang PetscFunctionBegin; 2327ee59b9bSJunchao Zhang PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 2337ee59b9bSJunchao Zhang 2347ee59b9bSJunchao Zhang if (i) *i = aijkok->i_device_data(); 2357ee59b9bSJunchao Zhang if (j) *j = aijkok->j_device_data(); 2367ee59b9bSJunchao Zhang if (a) { 237f3d3cd90SZach Atkins PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 2387ee59b9bSJunchao Zhang *a = aijkok->a_device_data(); 2397ee59b9bSJunchao Zhang } 2407ee59b9bSJunchao Zhang if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 2413ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2427ee59b9bSJunchao Zhang } 2437ee59b9bSJunchao Zhang 24403db1824SAlex Lindsay static PetscErrorCode MatGetCurrentMemType_SeqAIJKokkos(PETSC_UNUSED Mat A, PetscMemType *mtype) 24503db1824SAlex Lindsay { 24603db1824SAlex Lindsay PetscFunctionBegin; 24703db1824SAlex Lindsay *mtype = PETSC_MEMTYPE_KOKKOS; 24803db1824SAlex Lindsay PetscFunctionReturn(PETSC_SUCCESS); 24903db1824SAlex Lindsay } 25003db1824SAlex Lindsay 2510e3ece09SJunchao Zhang /* 2520e3ece09SJunchao Zhang Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 2530e3ece09SJunchao Zhang 2540e3ece09SJunchao Zhang Input Parameter: 2550e3ece09SJunchao Zhang . A - the MATSEQAIJKOKKOS matrix 2560e3ece09SJunchao Zhang 2570e3ece09SJunchao Zhang Output Parameters: 2580e3ece09SJunchao Zhang + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 259aaa8cc7dSPierre Jolivet - T_d - the transpose on device, whose value array is allocated but not initialized 2600e3ece09SJunchao Zhang */ 2610e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 262d71ae5a4SJacob Faibussowitsch { 2630e3ece09SJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 2640e3ece09SJunchao Zhang PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 2650e3ece09SJunchao Zhang const PetscInt *Ai = aseq->i, *Aj = aseq->j; 2667b8d4ba6SJunchao Zhang MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 2670e3ece09SJunchao Zhang MatRowMapType *Ti = Ti_h.data(); 2687b8d4ba6SJunchao Zhang MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 2697b8d4ba6SJunchao Zhang MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 2700e3ece09SJunchao Zhang PetscInt *Tj = Tj_h.data(); 2710e3ece09SJunchao Zhang PetscInt *perm = perm_h.data(); 2720e3ece09SJunchao Zhang PetscInt *offset; 273152b3e56SJunchao Zhang 274152b3e56SJunchao Zhang PetscFunctionBegin; 2750e3ece09SJunchao Zhang // Populate Ti 2760e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 2770e3ece09SJunchao Zhang Ti++; 2780e3ece09SJunchao Zhang for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 2790e3ece09SJunchao Zhang Ti--; 2800e3ece09SJunchao Zhang for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 2810e3ece09SJunchao Zhang 2820e3ece09SJunchao Zhang // Populate Tj and the permutation array 2830e3ece09SJunchao Zhang PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 2840e3ece09SJunchao Zhang for (PetscInt i = 0; i < m; i++) { 2850e3ece09SJunchao Zhang for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 2860e3ece09SJunchao Zhang PetscInt r = Aj[j]; // row r of T 2870e3ece09SJunchao Zhang PetscInt disp = Ti[r] + offset[r]; 2880e3ece09SJunchao Zhang 2890e3ece09SJunchao Zhang Tj[disp] = i; // col i of T 2900e3ece09SJunchao Zhang perm[disp] = j; 2910e3ece09SJunchao Zhang offset[r]++; 292076ba34aSJunchao Zhang } 2930e3ece09SJunchao Zhang } 2940e3ece09SJunchao Zhang PetscCall(PetscFree(offset)); 2950e3ece09SJunchao Zhang 2960e3ece09SJunchao Zhang // Sort each row of T, along with the permutation array 2970e3ece09SJunchao Zhang for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 2980e3ece09SJunchao Zhang 2990e3ece09SJunchao Zhang // Output perm and T on device 3000e3ece09SJunchao Zhang auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 3010e3ece09SJunchao Zhang auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 3020e3ece09SJunchao Zhang PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 3030e3ece09SJunchao Zhang PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 3043ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 305152b3e56SJunchao Zhang } 306152b3e56SJunchao Zhang 3070e3ece09SJunchao Zhang // Generate the transpose on device and cache it internally 3080e3ece09SJunchao Zhang // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 3090e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 310d71ae5a4SJacob Faibussowitsch { 3110e3ece09SJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 3120e3ece09SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 3130e3ece09SJunchao Zhang PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 3140e3ece09SJunchao Zhang KokkosCsrMatrix &T = akok->csrmatT; 315152b3e56SJunchao Zhang 316152b3e56SJunchao Zhang PetscFunctionBegin; 3170e3ece09SJunchao Zhang PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 318f3d3cd90SZach Atkins PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 3190e3ece09SJunchao Zhang 3200e3ece09SJunchao Zhang const auto &Aa = akok->a_dual.view_device(); 3210e3ece09SJunchao Zhang 3220e3ece09SJunchao Zhang if (A->symmetric == PETSC_BOOL3_TRUE) { 3230e3ece09SJunchao Zhang *csrmatT = akok->csrmat; 3240e3ece09SJunchao Zhang } else { 3250e3ece09SJunchao Zhang // See if we already have a cached transpose and its value is up to date 3260e3ece09SJunchao Zhang if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 3270e3ece09SJunchao Zhang if (!akok->transpose_updated) { // if the value is out of date, update the cached version 3280e3ece09SJunchao Zhang const auto &perm = akok->transpose_perm; // get the permutation array 3290e3ece09SJunchao Zhang auto &Ta = T.values; 3300e3ece09SJunchao Zhang 331d326c3f1SJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 332076ba34aSJunchao Zhang } 3330e3ece09SJunchao Zhang } else { // Generate T of size n x m for the first time 3340e3ece09SJunchao Zhang MatRowMapKokkosView perm; 3350e3ece09SJunchao Zhang 3360e3ece09SJunchao Zhang PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 3370e3ece09SJunchao Zhang akok->transpose_perm = perm; // cache the perm in this matrix for reuse 338d326c3f1SJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 3390e3ece09SJunchao Zhang } 3400e3ece09SJunchao Zhang akok->transpose_updated = PETSC_TRUE; 3410e3ece09SJunchao Zhang *csrmatT = akok->csrmatT; 3420e3ece09SJunchao Zhang } 3430e3ece09SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 3440e3ece09SJunchao Zhang } 3450e3ece09SJunchao Zhang 3460e3ece09SJunchao Zhang // Generate the Hermitian on device and cache it internally 3470e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 3480e3ece09SJunchao Zhang { 3490e3ece09SJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 3500e3ece09SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 3510e3ece09SJunchao Zhang PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 3520e3ece09SJunchao Zhang KokkosCsrMatrix &T = akok->csrmatH; 3530e3ece09SJunchao Zhang 3540e3ece09SJunchao Zhang PetscFunctionBegin; 3550e3ece09SJunchao Zhang PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 356f3d3cd90SZach Atkins PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 3570e3ece09SJunchao Zhang 3580e3ece09SJunchao Zhang const auto &Aa = akok->a_dual.view_device(); 3590e3ece09SJunchao Zhang 3600e3ece09SJunchao Zhang if (A->hermitian == PETSC_BOOL3_TRUE) { 3610e3ece09SJunchao Zhang *csrmatH = akok->csrmat; 3620e3ece09SJunchao Zhang } else { 3630e3ece09SJunchao Zhang // See if we already have a cached hermitian and its value is up to date 3640e3ece09SJunchao Zhang if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 3650e3ece09SJunchao Zhang if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 3660e3ece09SJunchao Zhang const auto &perm = akok->transpose_perm; // get the permutation array 3670e3ece09SJunchao Zhang auto &Ta = T.values; 3680e3ece09SJunchao Zhang 369d326c3f1SJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 3700e3ece09SJunchao Zhang } 3710e3ece09SJunchao Zhang } else { // Generate T of size n x m for the first time 3720e3ece09SJunchao Zhang MatRowMapKokkosView perm; 3730e3ece09SJunchao Zhang 3740e3ece09SJunchao Zhang PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 3750e3ece09SJunchao Zhang akok->transpose_perm = perm; // cache the perm in this matrix for reuse 376d326c3f1SJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 3770e3ece09SJunchao Zhang } 3780e3ece09SJunchao Zhang akok->hermitian_updated = PETSC_TRUE; 3790e3ece09SJunchao Zhang *csrmatH = akok->csrmatH; 3800e3ece09SJunchao Zhang } 3813ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 382152b3e56SJunchao Zhang } 383a587d139SMark 3848c3ff71bSJunchao Zhang /* y = A x */ 385d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 386d71ae5a4SJacob Faibussowitsch { 3878c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 388152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 389152b3e56SJunchao Zhang PetscScalarKokkosView yv; 3908c3ff71bSJunchao Zhang 3918c3ff71bSJunchao Zhang PetscFunctionBegin; 3929566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3939566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3949566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 3959566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 3968c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 397d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 3989566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 3999566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 400076ba34aSJunchao Zhang /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 4019566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 4029566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4033ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4048c3ff71bSJunchao Zhang } 4058c3ff71bSJunchao Zhang 4068c3ff71bSJunchao Zhang /* y = A^T x */ 407d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 408d71ae5a4SJacob Faibussowitsch { 4098c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 410152b3e56SJunchao Zhang const char *mode; 411152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 412152b3e56SJunchao Zhang PetscScalarKokkosView yv; 4130e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 4148c3ff71bSJunchao Zhang 4158c3ff71bSJunchao Zhang PetscFunctionBegin; 4169566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4179566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 4189566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 4199566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 420152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 4219566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 422152b3e56SJunchao Zhang mode = "N"; 423152b3e56SJunchao Zhang } else { 424076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 4250e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 426152b3e56SJunchao Zhang mode = "T"; 427152b3e56SJunchao Zhang } 428d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 4299566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 4309566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 4310e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 4329566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4333ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4348c3ff71bSJunchao Zhang } 4358c3ff71bSJunchao Zhang 4368c3ff71bSJunchao Zhang /* y = A^H x */ 437d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 438d71ae5a4SJacob Faibussowitsch { 4398c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 440152b3e56SJunchao Zhang const char *mode; 441152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 442152b3e56SJunchao Zhang PetscScalarKokkosView yv; 4430e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 4448c3ff71bSJunchao Zhang 4458c3ff71bSJunchao Zhang PetscFunctionBegin; 4469566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4479566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 4489566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 4499566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 450152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 4519566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 452152b3e56SJunchao Zhang mode = "N"; 453152b3e56SJunchao Zhang } else { 454076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 4550e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 456152b3e56SJunchao Zhang mode = "C"; 457152b3e56SJunchao Zhang } 458d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 4599566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 4609566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 4610e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 4629566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4633ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4648c3ff71bSJunchao Zhang } 4658c3ff71bSJunchao Zhang 4668c3ff71bSJunchao Zhang /* z = A x + y */ 467d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 468d71ae5a4SJacob Faibussowitsch { 4698c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 47092896123SJunchao Zhang ConstPetscScalarKokkosView xv; 471152b3e56SJunchao Zhang PetscScalarKokkosView zv; 4728c3ff71bSJunchao Zhang 4738c3ff71bSJunchao Zhang PetscFunctionBegin; 4749566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4759566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 47692896123SJunchao Zhang if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host 4779566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 47892896123SJunchao Zhang PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device 4798c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 480d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 4819566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 48292896123SJunchao Zhang PetscCall(VecRestoreKokkosView(zz, &zv)); 4839566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 4849566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4853ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4868c3ff71bSJunchao Zhang } 4878c3ff71bSJunchao Zhang 4888c3ff71bSJunchao Zhang /* z = A^T x + y */ 489d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 490d71ae5a4SJacob Faibussowitsch { 4918c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 492152b3e56SJunchao Zhang const char *mode; 49392896123SJunchao Zhang ConstPetscScalarKokkosView xv; 494152b3e56SJunchao Zhang PetscScalarKokkosView zv; 4950e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 4968c3ff71bSJunchao Zhang 4978c3ff71bSJunchao Zhang PetscFunctionBegin; 4989566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4999566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 50092896123SJunchao Zhang if (zz != yy) PetscCall(VecCopy(yy, zz)); 5019566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 50292896123SJunchao Zhang PetscCall(VecGetKokkosView(zz, &zv)); 503152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 5049566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 505152b3e56SJunchao Zhang mode = "N"; 506152b3e56SJunchao Zhang } else { 507076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 5080e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 509152b3e56SJunchao Zhang mode = "T"; 510152b3e56SJunchao Zhang } 511d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 5129566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 51392896123SJunchao Zhang PetscCall(VecRestoreKokkosView(zz, &zv)); 5140e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 5159566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 5163ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5178c3ff71bSJunchao Zhang } 5188c3ff71bSJunchao Zhang 5198c3ff71bSJunchao Zhang /* z = A^H x + y */ 520d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 521d71ae5a4SJacob Faibussowitsch { 5228c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 523152b3e56SJunchao Zhang const char *mode; 52492896123SJunchao Zhang ConstPetscScalarKokkosView xv; 525152b3e56SJunchao Zhang PetscScalarKokkosView zv; 5260e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 5278c3ff71bSJunchao Zhang 5288c3ff71bSJunchao Zhang PetscFunctionBegin; 5299566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 5309566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 53192896123SJunchao Zhang if (zz != yy) PetscCall(VecCopy(yy, zz)); 5329566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 53392896123SJunchao Zhang PetscCall(VecGetKokkosView(zz, &zv)); 534152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 5359566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 536152b3e56SJunchao Zhang mode = "N"; 537152b3e56SJunchao Zhang } else { 538076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 5390e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 540152b3e56SJunchao Zhang mode = "C"; 541152b3e56SJunchao Zhang } 542d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 5439566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 54492896123SJunchao Zhang PetscCall(VecRestoreKokkosView(zz, &zv)); 5450e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 5469566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 5473ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 548152b3e56SJunchao Zhang } 549152b3e56SJunchao Zhang 55066976f2fSJacob Faibussowitsch static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 551d71ae5a4SJacob Faibussowitsch { 552152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 553152b3e56SJunchao Zhang 554152b3e56SJunchao Zhang PetscFunctionBegin; 555152b3e56SJunchao Zhang switch (op) { 556152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 557152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 5589566063dSJacob Faibussowitsch if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 559152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 560152b3e56SJunchao Zhang break; 561d71ae5a4SJacob Faibussowitsch default: 562d71ae5a4SJacob Faibussowitsch PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 563d71ae5a4SJacob Faibussowitsch break; 564152b3e56SJunchao Zhang } 5653ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5668c3ff71bSJunchao Zhang } 5678c3ff71bSJunchao Zhang 568076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */ 569d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 570d71ae5a4SJacob Faibussowitsch { 571076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 5728c3ff71bSJunchao Zhang 5738c3ff71bSJunchao Zhang PetscFunctionBegin; 5749566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 575076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 57651ece73cSJunchao Zhang PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 57751ece73cSJunchao Zhang PetscCall(MatSetType(*newmat, mtype)); 5788c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 5799566063dSJacob Faibussowitsch PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 580076ba34aSJunchao Zhang } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 5815f80ce2aSJacob Faibussowitsch PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 5829566063dSJacob Faibussowitsch PetscCall(PetscFree(A->defaultvectype)); 5839566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 5849566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 5859566063dSJacob Faibussowitsch PetscCall(MatSetOps_SeqAIJKokkos(A)); 586076ba34aSJunchao Zhang aseq = static_cast<Mat_SeqAIJ *>(A->data); 587394ed5ebSJunchao Zhang if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 5885f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 589f4747e26SJunchao Zhang A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 5908c3ff71bSJunchao Zhang } 591076ba34aSJunchao Zhang } 5923ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5938c3ff71bSJunchao Zhang } 5948c3ff71bSJunchao Zhang 595076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 596076ba34aSJunchao Zhang an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 597076ba34aSJunchao Zhang */ 598d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 599d71ae5a4SJacob Faibussowitsch { 600076ba34aSJunchao Zhang Mat_SeqAIJ *bseq; 601076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 602076ba34aSJunchao Zhang Mat mat; 6038c3ff71bSJunchao Zhang 6048c3ff71bSJunchao Zhang PetscFunctionBegin; 605076ba34aSJunchao Zhang /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 6069566063dSJacob Faibussowitsch PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 607076ba34aSJunchao Zhang mat = *B; 608f4747e26SJunchao Zhang if (A->assembled) { 609076ba34aSJunchao Zhang bseq = static_cast<Mat_SeqAIJ *>(mat->data); 610f4747e26SJunchao Zhang bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 611076ba34aSJunchao Zhang bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 612076ba34aSJunchao Zhang /* Now copy values to B if needed */ 613076ba34aSJunchao Zhang if (dupOption == MAT_COPY_VALUES) { 614076ba34aSJunchao Zhang if (akok->a_dual.need_sync_device()) { 615076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 616076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 617076ba34aSJunchao Zhang } else { /* If device has the latest data, we only copy data on device */ 618076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 619076ba34aSJunchao Zhang bkok->a_dual.modify_device(); 620076ba34aSJunchao Zhang } 621076ba34aSJunchao Zhang } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 622076ba34aSJunchao Zhang /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 623076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 624076ba34aSJunchao Zhang } 625076ba34aSJunchao Zhang mat->spptr = bkok; 626076ba34aSJunchao Zhang } 627076ba34aSJunchao Zhang 6289566063dSJacob Faibussowitsch PetscCall(PetscFree(mat->defaultvectype)); 6299566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 6309566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 6319566063dSJacob Faibussowitsch PetscCall(MatSetOps_SeqAIJKokkos(mat)); 6323ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6338c3ff71bSJunchao Zhang } 6348c3ff71bSJunchao Zhang 635d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 636d71ae5a4SJacob Faibussowitsch { 6370ecb592aSJunchao Zhang Mat At; 6380e3ece09SJunchao Zhang KokkosCsrMatrix internT; 6390ecb592aSJunchao Zhang Mat_SeqAIJKokkos *atkok, *bkok; 6400ecb592aSJunchao Zhang 6410ecb592aSJunchao Zhang PetscFunctionBegin; 6427fb60732SBarry Smith if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 6439566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 6440ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 645ff751488SJunchao Zhang /* Deep copy internT, as we want to isolate the internal transpose */ 6460e3ece09SJunchao Zhang PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 6479566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 6480ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) *B = At; 6499566063dSJacob Faibussowitsch else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 6500ecb592aSJunchao Zhang } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 6510ecb592aSJunchao Zhang if ((*B)->assembled) { 6520ecb592aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 6530e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 6549566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 6550ecb592aSJunchao Zhang } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 6560ecb592aSJunchao Zhang Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 6570e3ece09SJunchao Zhang MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 6580e3ece09SJunchao Zhang MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 6590e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 6600e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 6610ecb592aSJunchao Zhang } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 6620ecb592aSJunchao Zhang } 6633ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6640ecb592aSJunchao Zhang } 6650ecb592aSJunchao Zhang 666d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 667d71ae5a4SJacob Faibussowitsch { 66886a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 6698c3ff71bSJunchao Zhang 6708c3ff71bSJunchao Zhang PetscFunctionBegin; 67186a27549SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 67286a27549SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 6738c3ff71bSJunchao Zhang delete aijkok; 67486a27549SJunchao Zhang } else { 67586a27549SJunchao Zhang delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 67686a27549SJunchao Zhang } 677cbc6b225SStefano Zampini A->spptr = NULL; 6789566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 6799566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 6809566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 68157761e9aSJunchao Zhang #if defined(PETSC_HAVE_HYPRE) 68257761e9aSJunchao Zhang PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL)); 68357761e9aSJunchao Zhang #endif 6849566063dSJacob Faibussowitsch PetscCall(MatDestroy_SeqAIJ(A)); 6853ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6868c3ff71bSJunchao Zhang } 6878c3ff71bSJunchao Zhang 6883f3ba80aSJunchao Zhang /*MC 6893f3ba80aSJunchao Zhang MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 6903f3ba80aSJunchao Zhang 69115229ffcSPierre Jolivet A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 6923f3ba80aSJunchao Zhang 6932ef1f0ffSBarry Smith Options Database Key: 69411a5261eSBarry Smith . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 6953f3ba80aSJunchao Zhang 6963f3ba80aSJunchao Zhang Level: beginner 6973f3ba80aSJunchao Zhang 6981cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 6993f3ba80aSJunchao Zhang M*/ 700d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 701d71ae5a4SJacob Faibussowitsch { 70286a27549SJunchao Zhang PetscFunctionBegin; 7039566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 7049566063dSJacob Faibussowitsch PetscCall(MatCreate_SeqAIJ(A)); 7059566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 7063ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 70786a27549SJunchao Zhang } 70886a27549SJunchao Zhang 709076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 710d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 711d71ae5a4SJacob Faibussowitsch { 712076ba34aSJunchao Zhang Mat_SeqAIJ *a, *b; 713076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok, *bkok, *ckok; 714076ba34aSJunchao Zhang MatScalarKokkosView aa, ba, ca; 715076ba34aSJunchao Zhang MatRowMapKokkosView ai, bi, ci; 716076ba34aSJunchao Zhang MatColIdxKokkosView aj, bj, cj; 717076ba34aSJunchao Zhang PetscInt m, n, nnz, aN; 718a3f881fbSStefano Zampini 719a3f881fbSStefano Zampini PetscFunctionBegin; 720076ba34aSJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 721076ba34aSJunchao Zhang PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 7224f572ea9SToby Isaac PetscAssertPointer(C, 4); 723076ba34aSJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 724076ba34aSJunchao Zhang PetscCheckTypeName(B, MATSEQAIJKOKKOS); 7255f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 7265f80ce2aSJacob Faibussowitsch PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 727076ba34aSJunchao Zhang 7289566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 7299566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 730076ba34aSJunchao Zhang a = static_cast<Mat_SeqAIJ *>(A->data); 731076ba34aSJunchao Zhang b = static_cast<Mat_SeqAIJ *>(B->data); 732076ba34aSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 733076ba34aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 734076ba34aSJunchao Zhang aa = akok->a_dual.view_device(); 735076ba34aSJunchao Zhang ai = akok->i_dual.view_device(); 736076ba34aSJunchao Zhang ba = bkok->a_dual.view_device(); 737076ba34aSJunchao Zhang bi = bkok->i_dual.view_device(); 738076ba34aSJunchao Zhang m = A->rmap->n; /* M, N and nnz of C */ 739076ba34aSJunchao Zhang n = A->cmap->n + B->cmap->n; 740076ba34aSJunchao Zhang nnz = a->nz + b->nz; 741076ba34aSJunchao Zhang aN = A->cmap->n; /* N of A */ 742076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { 743076ba34aSJunchao Zhang aj = akok->j_dual.view_device(); 744076ba34aSJunchao Zhang bj = bkok->j_dual.view_device(); 745ecd797f4SJunchao Zhang auto ca = MatScalarKokkosView("a", aa.extent(0) + ba.extent(0)); 746ecd797f4SJunchao Zhang auto ci = MatRowMapKokkosView("i", ai.extent(0)); 747ecd797f4SJunchao Zhang auto cj = MatColIdxKokkosView("j", aj.extent(0) + bj.extent(0)); 748076ba34aSJunchao Zhang 749076ba34aSJunchao Zhang /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 7509371c9d4SSatish Balay Kokkos::parallel_for( 751d326c3f1SJunchao Zhang Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 752076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 753076ba34aSJunchao Zhang PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 754076ba34aSJunchao Zhang 755076ba34aSJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 756076ba34aSJunchao Zhang ci(i) = coffset; 757076ba34aSJunchao Zhang if (i == m - 1) ci(m) = ai(m) + bi(m); 758076ba34aSJunchao Zhang }); 759076ba34aSJunchao Zhang 760076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 761076ba34aSJunchao Zhang if (k < alen) { 762076ba34aSJunchao Zhang ca(coffset + k) = aa(ai(i) + k); 763076ba34aSJunchao Zhang cj(coffset + k) = aj(ai(i) + k); 764076ba34aSJunchao Zhang } else { 765076ba34aSJunchao Zhang ca(coffset + k) = ba(bi(i) + k - alen); 766076ba34aSJunchao Zhang cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 767076ba34aSJunchao Zhang } 768076ba34aSJunchao Zhang }); 769076ba34aSJunchao Zhang }); 770ecd797f4SJunchao Zhang PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci, cj, ca)); 7719566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 772076ba34aSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { 773076ba34aSJunchao Zhang PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 774076ba34aSJunchao Zhang PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 775076ba34aSJunchao Zhang ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 776076ba34aSJunchao Zhang ca = ckok->a_dual.view_device(); 777076ba34aSJunchao Zhang ci = ckok->i_dual.view_device(); 778076ba34aSJunchao Zhang 7799371c9d4SSatish Balay Kokkos::parallel_for( 780d326c3f1SJunchao Zhang Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 781076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 782076ba34aSJunchao Zhang PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 783076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 784076ba34aSJunchao Zhang if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 785076ba34aSJunchao Zhang else ca(ci(i) + k) = ba(bi(i) + k - alen); 786076ba34aSJunchao Zhang }); 787076ba34aSJunchao Zhang }); 7889566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 789076ba34aSJunchao Zhang } 7903ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 791076ba34aSJunchao Zhang } 792076ba34aSJunchao Zhang 793d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 794d71ae5a4SJacob Faibussowitsch { 795076ba34aSJunchao Zhang PetscFunctionBegin; 796076ba34aSJunchao Zhang delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 7973ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 798a3f881fbSStefano Zampini } 799a3f881fbSStefano Zampini 800d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 801d71ae5a4SJacob Faibussowitsch { 802a3f881fbSStefano Zampini Mat_Product *product = C->product; 803a3f881fbSStefano Zampini Mat A, B; 804076ba34aSJunchao Zhang bool transA, transB; /* use bool, since KK needs this type */ 805a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok, *bkok, *ckok; 806a3f881fbSStefano Zampini Mat_SeqAIJ *c; 807076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 8080e3ece09SJunchao Zhang KokkosCsrMatrix csrmatA, csrmatB; 809a3f881fbSStefano Zampini 810a3f881fbSStefano Zampini PetscFunctionBegin; 811a3f881fbSStefano Zampini MatCheckProduct(C, 1); 8125f80ce2aSJacob Faibussowitsch PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 813076ba34aSJunchao Zhang pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 814076ba34aSJunchao Zhang 8150e3ece09SJunchao Zhang // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 8160e3ece09SJunchao Zhang // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 8170e3ece09SJunchao Zhang // we still do numeric. 8180e3ece09SJunchao Zhang if (pdata->reusesym) { // numeric reuses results from symbolic 8190e3ece09SJunchao Zhang pdata->reusesym = PETSC_FALSE; 8203ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 821076ba34aSJunchao Zhang } 822076ba34aSJunchao Zhang 823076ba34aSJunchao Zhang switch (product->type) { 8249371c9d4SSatish Balay case MATPRODUCT_AB: 8259371c9d4SSatish Balay transA = false; 8269371c9d4SSatish Balay transB = false; 8279371c9d4SSatish Balay break; 8289371c9d4SSatish Balay case MATPRODUCT_AtB: 8299371c9d4SSatish Balay transA = true; 8309371c9d4SSatish Balay transB = false; 8319371c9d4SSatish Balay break; 8329371c9d4SSatish Balay case MATPRODUCT_ABt: 8339371c9d4SSatish Balay transA = false; 8349371c9d4SSatish Balay transB = true; 8359371c9d4SSatish Balay break; 836d71ae5a4SJacob Faibussowitsch default: 837d71ae5a4SJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 838076ba34aSJunchao Zhang } 839076ba34aSJunchao Zhang 840a3f881fbSStefano Zampini A = product->A; 841a3f881fbSStefano Zampini B = product->B; 8429566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 8439566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 844a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 845a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 846a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 847076ba34aSJunchao Zhang 8485f80ce2aSJacob Faibussowitsch PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 849076ba34aSJunchao Zhang 8500e3ece09SJunchao Zhang csrmatA = akok->csrmat; 8510e3ece09SJunchao Zhang csrmatB = bkok->csrmat; 852076ba34aSJunchao Zhang 853076ba34aSJunchao Zhang /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 854076ba34aSJunchao Zhang if (transA) { 8559566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 856076ba34aSJunchao Zhang transA = false; 857a3f881fbSStefano Zampini } 858a3f881fbSStefano Zampini 859076ba34aSJunchao Zhang if (transB) { 8609566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 861076ba34aSJunchao Zhang transB = false; 862076ba34aSJunchao Zhang } 8639566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 8640e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 8650e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 866866eb059SJunchao Zhang auto spgemmHandle = pdata->kh.get_spgemm_handle(); 867866eb059SJunchao Zhang if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 868e944a159SJunchao Zhang #endif 869866eb059SJunchao Zhang 8709566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 8719566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(C)); 872a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 873a3f881fbSStefano Zampini c = (Mat_SeqAIJ *)C->data; 8749566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 8759566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 8769566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 877a3f881fbSStefano Zampini c->reallocs = 0; 878076ba34aSJunchao Zhang C->info.mallocs = 0; 879a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 880a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 881a3f881fbSStefano Zampini C->num_ass++; 8823ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 883a3f881fbSStefano Zampini } 884a3f881fbSStefano Zampini 885d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 886d71ae5a4SJacob Faibussowitsch { 887076ba34aSJunchao Zhang Mat_Product *product = C->product; 888076ba34aSJunchao Zhang MatProductType ptype; 889076ba34aSJunchao Zhang Mat A, B; 890076ba34aSJunchao Zhang bool transA, transB; 891076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok, *bkok, *ckok; 892076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 893076ba34aSJunchao Zhang MPI_Comm comm; 8940e3ece09SJunchao Zhang KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 895a3f881fbSStefano Zampini 896a3f881fbSStefano Zampini PetscFunctionBegin; 897a3f881fbSStefano Zampini MatCheckProduct(C, 1); 8989566063dSJacob Faibussowitsch PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 8995f80ce2aSJacob Faibussowitsch PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 900a3f881fbSStefano Zampini A = product->A; 901a3f881fbSStefano Zampini B = product->B; 9029566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 9039566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 904a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 905a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 9060e3ece09SJunchao Zhang csrmatA = akok->csrmat; 9070e3ece09SJunchao Zhang csrmatB = bkok->csrmat; 908076ba34aSJunchao Zhang 909a3f881fbSStefano Zampini ptype = product->type; 9100e3ece09SJunchao Zhang // Take advantage of the symmetry if true 9110e3ece09SJunchao Zhang if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 9120e3ece09SJunchao Zhang ptype = MATPRODUCT_AB; 9130e3ece09SJunchao Zhang product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 9140e3ece09SJunchao Zhang } 9150e3ece09SJunchao Zhang if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 9160e3ece09SJunchao Zhang ptype = MATPRODUCT_AB; 9170e3ece09SJunchao Zhang product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 9180e3ece09SJunchao Zhang } 9190e3ece09SJunchao Zhang 920a3f881fbSStefano Zampini switch (ptype) { 9219371c9d4SSatish Balay case MATPRODUCT_AB: 9229371c9d4SSatish Balay transA = false; 9239371c9d4SSatish Balay transB = false; 9240e6a1e94SMark Adams PetscCall(MatSetBlockSizesFromMats(C, A, B)); 9259371c9d4SSatish Balay break; 9269371c9d4SSatish Balay case MATPRODUCT_AtB: 9279371c9d4SSatish Balay transA = true; 9289371c9d4SSatish Balay transB = false; 9290e6a1e94SMark Adams if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 9300e6a1e94SMark Adams if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 9319371c9d4SSatish Balay break; 9329371c9d4SSatish Balay case MATPRODUCT_ABt: 9339371c9d4SSatish Balay transA = false; 9349371c9d4SSatish Balay transB = true; 9350e6a1e94SMark Adams if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 9360e6a1e94SMark Adams if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 9379371c9d4SSatish Balay break; 938d71ae5a4SJacob Faibussowitsch default: 939d71ae5a4SJacob Faibussowitsch SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 940a3f881fbSStefano Zampini } 9410e3ece09SJunchao Zhang PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 942076ba34aSJunchao Zhang pdata->reusesym = product->api_user; 943a3f881fbSStefano Zampini 944076ba34aSJunchao Zhang /* TODO: add command line options to select spgemm algorithms */ 945866eb059SJunchao Zhang auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 946866eb059SJunchao Zhang 947866eb059SJunchao Zhang /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 948866eb059SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 949866eb059SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 950866eb059SJunchao Zhang spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 951866eb059SJunchao Zhang #endif 952866eb059SJunchao Zhang #endif 9530e3ece09SJunchao Zhang PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 954076ba34aSJunchao Zhang 9559566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 956076ba34aSJunchao Zhang /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 957076ba34aSJunchao Zhang if (transA) { 9589566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 959076ba34aSJunchao Zhang transA = false; 960076ba34aSJunchao Zhang } 961076ba34aSJunchao Zhang 962076ba34aSJunchao Zhang if (transB) { 9639566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 964076ba34aSJunchao Zhang transB = false; 965076ba34aSJunchao Zhang } 966076ba34aSJunchao Zhang 9670e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 968076ba34aSJunchao Zhang /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 969076ba34aSJunchao Zhang So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 970076ba34aSJunchao Zhang calling new Mat_SeqAIJKokkos(). 971076ba34aSJunchao Zhang TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 972076ba34aSJunchao Zhang */ 9730e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 9740e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 975866eb059SJunchao Zhang /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 976866eb059SJunchao Zhang auto spgemmHandle = pdata->kh.get_spgemm_handle(); 977866eb059SJunchao Zhang if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 978e944a159SJunchao Zhang #endif 9799566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 980076ba34aSJunchao Zhang 9819566063dSJacob Faibussowitsch PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 9829566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 983076ba34aSJunchao Zhang C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 9843ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 985a3f881fbSStefano Zampini } 986a3f881fbSStefano Zampini 987a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 988d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 989d71ae5a4SJacob Faibussowitsch { 990076ba34aSJunchao Zhang Mat_Product *product = mat->product; 991a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 992a3f881fbSStefano Zampini 993a3f881fbSStefano Zampini PetscFunctionBegin; 994a3f881fbSStefano Zampini MatCheckProduct(mat, 1); 9959566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 99648a46eb9SPierre Jolivet if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 997a3f881fbSStefano Zampini if (Biskok && Ciskok) { 998a3f881fbSStefano Zampini switch (product->type) { 999a3f881fbSStefano Zampini case MATPRODUCT_AB: 1000a3f881fbSStefano Zampini case MATPRODUCT_AtB: 1001d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABt: 1002d71ae5a4SJacob Faibussowitsch mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 1003d71ae5a4SJacob Faibussowitsch break; 1004a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 1005a3f881fbSStefano Zampini case MATPRODUCT_RARt: 1006d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABC: 1007d71ae5a4SJacob Faibussowitsch mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 1008d71ae5a4SJacob Faibussowitsch break; 1009d71ae5a4SJacob Faibussowitsch default: 1010d71ae5a4SJacob Faibussowitsch break; 1011a3f881fbSStefano Zampini } 1012a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 10139566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 1014a3f881fbSStefano Zampini } 10153ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1016a3f881fbSStefano Zampini } 1017a587d139SMark 1018d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 1019d71ae5a4SJacob Faibussowitsch { 1020f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 1021f0cf5187SStefano Zampini 1022f0cf5187SStefano Zampini PetscFunctionBegin; 10239566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 10249566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1025f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1026d326c3f1SJunchao Zhang KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 10279566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 10289566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 10299566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 10303ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1031f0cf5187SStefano Zampini } 1032f0cf5187SStefano Zampini 1033f4747e26SJunchao Zhang // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 1034f4747e26SJunchao Zhang static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 1035f4747e26SJunchao Zhang { 1036f4747e26SJunchao Zhang Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1037f4747e26SJunchao Zhang 1038f4747e26SJunchao Zhang PetscFunctionBegin; 1039f4747e26SJunchao Zhang if (A->assembled && aijseq->diagonaldense) { // no missing diagonals 1040f4747e26SJunchao Zhang PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 1041f4747e26SJunchao Zhang 1042f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1043f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1044f4747e26SJunchao Zhang const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1045f4747e26SJunchao Zhang const auto &Aa = aijkok->a_dual.view_device(); 1046f4747e26SJunchao Zhang const auto &Adiag = aijkok->diag_dual.view_device(); 1047d326c3f1SJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1048f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1049f4747e26SJunchao Zhang PetscCall(PetscLogGpuFlops(n)); 1050f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1051f4747e26SJunchao Zhang } else { // need reassembly, very slow! 1052f4747e26SJunchao Zhang PetscCall(MatShift_Basic(A, a)); 1053f4747e26SJunchao Zhang } 1054f4747e26SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 1055f4747e26SJunchao Zhang } 1056f4747e26SJunchao Zhang 1057f4747e26SJunchao Zhang static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1058f4747e26SJunchao Zhang { 1059f4747e26SJunchao Zhang Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1060f4747e26SJunchao Zhang 1061f4747e26SJunchao Zhang PetscFunctionBegin; 1062f4747e26SJunchao Zhang if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals 1063f4747e26SJunchao Zhang ConstPetscScalarKokkosView dv; 1064f4747e26SJunchao Zhang PetscInt n, nv; 1065f4747e26SJunchao Zhang 1066f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1067f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1068f4747e26SJunchao Zhang PetscCall(VecGetKokkosView(D, &dv)); 1069f4747e26SJunchao Zhang PetscCall(VecGetLocalSize(D, &nv)); 1070f4747e26SJunchao Zhang n = PetscMin(Y->rmap->n, Y->cmap->n); 1071f4747e26SJunchao Zhang PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1072f4747e26SJunchao Zhang 1073f4747e26SJunchao Zhang const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1074f4747e26SJunchao Zhang const auto &Aa = aijkok->a_dual.view_device(); 1075f4747e26SJunchao Zhang const auto &Adiag = aijkok->diag_dual.view_device(); 1076f4747e26SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( 1077d326c3f1SJunchao Zhang Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1078f4747e26SJunchao Zhang if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1079f4747e26SJunchao Zhang else Aa(Adiag(i)) += dv(i); 1080f4747e26SJunchao Zhang })); 1081f4747e26SJunchao Zhang PetscCall(VecRestoreKokkosView(D, &dv)); 1082f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1083f4747e26SJunchao Zhang PetscCall(PetscLogGpuFlops(n)); 1084f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1085f4747e26SJunchao Zhang } else { // need reassembly, very slow! 1086f4747e26SJunchao Zhang PetscCall(MatDiagonalSet_Default(Y, D, is)); 1087f4747e26SJunchao Zhang } 1088f4747e26SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 1089f4747e26SJunchao Zhang } 1090f4747e26SJunchao Zhang 1091f4747e26SJunchao Zhang static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1092f4747e26SJunchao Zhang { 1093f4747e26SJunchao Zhang Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1094f4747e26SJunchao Zhang PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1095f4747e26SJunchao Zhang ConstPetscScalarKokkosView lv, rv; 1096f4747e26SJunchao Zhang 1097f4747e26SJunchao Zhang PetscFunctionBegin; 1098f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1099f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1100f4747e26SJunchao Zhang const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1101f4747e26SJunchao Zhang const auto &Aa = aijkok->a_dual.view_device(); 1102f4747e26SJunchao Zhang const auto &Ai = aijkok->i_dual.view_device(); 1103f4747e26SJunchao Zhang const auto &Aj = aijkok->j_dual.view_device(); 1104f4747e26SJunchao Zhang if (ll) { 1105f4747e26SJunchao Zhang PetscCall(VecGetLocalSize(ll, &m)); 1106f4747e26SJunchao Zhang PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1107f4747e26SJunchao Zhang PetscCall(VecGetKokkosView(ll, &lv)); 1108f4747e26SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( // for each row 1109d326c3f1SJunchao Zhang Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1110f4747e26SJunchao Zhang PetscInt i = t.league_rank(); // row i 1111f4747e26SJunchao Zhang PetscInt len = Ai(i + 1) - Ai(i); 1112f4747e26SJunchao Zhang // scale entries on the row 1113f4747e26SJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1114f4747e26SJunchao Zhang })); 1115f4747e26SJunchao Zhang PetscCall(VecRestoreKokkosView(ll, &lv)); 1116f4747e26SJunchao Zhang PetscCall(PetscLogGpuFlops(nz)); 1117f4747e26SJunchao Zhang } 1118f4747e26SJunchao Zhang if (rr) { 1119f4747e26SJunchao Zhang PetscCall(VecGetLocalSize(rr, &n)); 1120f4747e26SJunchao Zhang PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1121f4747e26SJunchao Zhang PetscCall(VecGetKokkosView(rr, &rv)); 1122f4747e26SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1123d326c3f1SJunchao Zhang Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1124f4747e26SJunchao Zhang PetscCall(VecRestoreKokkosView(rr, &lv)); 1125f4747e26SJunchao Zhang PetscCall(PetscLogGpuFlops(nz)); 1126f4747e26SJunchao Zhang } 1127f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1128f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1129f4747e26SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 1130f4747e26SJunchao Zhang } 1131f4747e26SJunchao Zhang 1132d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1133d71ae5a4SJacob Faibussowitsch { 1134076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1135a587d139SMark 1136a587d139SMark PetscFunctionBegin; 1137076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 11382328674fSJunchao Zhang if (aijkok) { /* Only zero the device if data is already there */ 1139d326c3f1SJunchao Zhang KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 11409566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 11412328674fSJunchao Zhang } else { /* Might be preallocated but not assembled */ 11429566063dSJacob Faibussowitsch PetscCall(MatZeroEntries_SeqAIJ(A)); 11432328674fSJunchao Zhang } 11443ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1145a587d139SMark } 1146a587d139SMark 1147d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1148d71ae5a4SJacob Faibussowitsch { 1149f78ce678SMark Adams Mat_SeqAIJKokkos *aijkok; 1150f78ce678SMark Adams PetscInt n; 1151f78ce678SMark Adams PetscScalarKokkosView xv; 1152f78ce678SMark Adams 1153f78ce678SMark Adams PetscFunctionBegin; 1154f78ce678SMark Adams PetscCall(VecGetLocalSize(x, &n)); 1155f78ce678SMark Adams PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1156f78ce678SMark Adams PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1157f78ce678SMark Adams 1158f78ce678SMark Adams PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1159f78ce678SMark Adams aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1160f78ce678SMark Adams 1161f78ce678SMark Adams const auto &Aa = aijkok->a_dual.view_device(); 1162f78ce678SMark Adams const auto &Ai = aijkok->i_dual.view_device(); 1163f78ce678SMark Adams const auto &Adiag = aijkok->diag_dual.view_device(); 1164f78ce678SMark Adams 1165f78ce678SMark Adams PetscCall(VecGetKokkosViewWrite(x, &xv)); 11669371c9d4SSatish Balay Kokkos::parallel_for( 1167d326c3f1SJunchao Zhang Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1168f78ce678SMark Adams if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1169f78ce678SMark Adams else xv(i) = 0; 1170f78ce678SMark Adams }); 1171f78ce678SMark Adams PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 11723ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1173f78ce678SMark Adams } 1174f78ce678SMark Adams 1175db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1176d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1177d71ae5a4SJacob Faibussowitsch { 1178db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1179db78de30SJunchao Zhang 1180db78de30SJunchao Zhang PetscFunctionBegin; 1181db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 11824f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1183db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 11849566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1185db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1186076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 11873ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1188db78de30SJunchao Zhang } 1189db78de30SJunchao Zhang 1190d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1191d71ae5a4SJacob Faibussowitsch { 1192db78de30SJunchao Zhang PetscFunctionBegin; 1193db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 11944f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1195db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 11963ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1197db78de30SJunchao Zhang } 1198db78de30SJunchao Zhang 1199d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1200d71ae5a4SJacob Faibussowitsch { 1201db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1202db78de30SJunchao Zhang 1203db78de30SJunchao Zhang PetscFunctionBegin; 1204db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 12054f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1206db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 12079566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1208db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1209076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 12103ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1211db78de30SJunchao Zhang } 1212db78de30SJunchao Zhang 1213d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1214d71ae5a4SJacob Faibussowitsch { 1215db78de30SJunchao Zhang PetscFunctionBegin; 1216db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 12174f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1218db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 12199566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 12203ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1221db78de30SJunchao Zhang } 1222db78de30SJunchao Zhang 1223d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1224d71ae5a4SJacob Faibussowitsch { 1225db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1226db78de30SJunchao Zhang 1227db78de30SJunchao Zhang PetscFunctionBegin; 1228db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 12294f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1230db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1231db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1232076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 12333ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1234db78de30SJunchao Zhang } 1235db78de30SJunchao Zhang 1236d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1237d71ae5a4SJacob Faibussowitsch { 1238db78de30SJunchao Zhang PetscFunctionBegin; 1239db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 12404f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1241db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 12429566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 12433ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1244db78de30SJunchao Zhang } 1245db78de30SJunchao Zhang 1246c0c276a7Ssdargavi PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A) 1247c0c276a7Ssdargavi { 1248c0c276a7Ssdargavi Mat_SeqAIJKokkos *akok; 1249c0c276a7Ssdargavi 1250c0c276a7Ssdargavi PetscFunctionBegin; 1251ecd797f4SJunchao Zhang PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_d.extent(0), i_d, j_d, a_d)); 1252c0c276a7Ssdargavi PetscCall(MatCreate(comm, A)); 1253c0c276a7Ssdargavi PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1254c0c276a7Ssdargavi PetscFunctionReturn(PETSC_SUCCESS); 1255c0c276a7Ssdargavi } 1256c0c276a7Ssdargavi 1257c17cf699SJunchao Zhang /* Computes Y += alpha X */ 1258d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1259d71ae5a4SJacob Faibussowitsch { 1260a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1261c17cf699SJunchao Zhang Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1262c17cf699SJunchao Zhang ConstMatScalarKokkosView Xa; 1263c17cf699SJunchao Zhang MatScalarKokkosView Ya; 12644df4a32cSJunchao Zhang auto exec = PetscGetKokkosExecutionSpace(); 1265a587d139SMark 1266a587d139SMark PetscFunctionBegin; 1267c17cf699SJunchao Zhang PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1268c17cf699SJunchao Zhang PetscCheckTypeName(X, MATSEQAIJKOKKOS); 12699566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 12709566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(X)); 12719566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1272db78de30SJunchao Zhang 1273c17cf699SJunchao Zhang if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1274a587d139SMark PetscBool e; 12759566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1276a587d139SMark if (e) { 12779566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1278c17cf699SJunchao Zhang if (e) pattern = SAME_NONZERO_PATTERN; 1279a587d139SMark } 1280a587d139SMark } 1281db78de30SJunchao Zhang 1282c17cf699SJunchao Zhang /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1283c17cf699SJunchao Zhang cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1284c17cf699SJunchao Zhang If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1285c17cf699SJunchao Zhang KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1286c17cf699SJunchao Zhang */ 1287c17cf699SJunchao Zhang ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1288c17cf699SJunchao Zhang xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1289c17cf699SJunchao Zhang Xa = xkok->a_dual.view_device(); 1290c17cf699SJunchao Zhang Ya = ykok->a_dual.view_device(); 1291c17cf699SJunchao Zhang 1292c17cf699SJunchao Zhang if (pattern == SAME_NONZERO_PATTERN) { 1293d326c3f1SJunchao Zhang KokkosBlas::axpy(exec, alpha, Xa, Ya); 12949566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1295c17cf699SJunchao Zhang } else if (pattern == SUBSET_NONZERO_PATTERN) { 1296c17cf699SJunchao Zhang MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1297c17cf699SJunchao Zhang MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1298c17cf699SJunchao Zhang 12999371c9d4SSatish Balay Kokkos::parallel_for( 1300d326c3f1SJunchao Zhang Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 13010e3ece09SJunchao Zhang PetscInt i = t.league_rank(); // row i 13020e3ece09SJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { 13030e3ece09SJunchao Zhang // Only one thread works in a team 1304c17cf699SJunchao Zhang PetscInt p, q = Yi(i); 13050e3ece09SJunchao Zhang for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 13060e3ece09SJunchao Zhang while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 13070e3ece09SJunchao Zhang if (Xj(p) == Yj(q)) { // Found it 1308c17cf699SJunchao Zhang Ya(q) += alpha * Xa(p); 1309c17cf699SJunchao Zhang q++; 1310a587d139SMark } else { 13110e3ece09SJunchao Zhang // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 13120e3ece09SJunchao Zhang // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 13130e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 13140e3ece09SJunchao Zhang if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 13158b8b16f9SJunchao Zhang #else 13160e3ece09SJunchao Zhang if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 13178b8b16f9SJunchao Zhang #endif 1318a587d139SMark } 1319c17cf699SJunchao Zhang } 1320c17cf699SJunchao Zhang }); 1321c17cf699SJunchao Zhang }); 13229566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 13230e3ece09SJunchao Zhang } else { // different nonzero patterns 1324c17cf699SJunchao Zhang Mat Z; 1325c17cf699SJunchao Zhang KokkosCsrMatrix zcsr; 1326c17cf699SJunchao Zhang KernelHandle kh; 13270e3ece09SJunchao Zhang kh.create_spadd_handle(true); // X, Y are sorted 1328c17cf699SJunchao Zhang KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1329c17cf699SJunchao Zhang KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1330c17cf699SJunchao Zhang zkok = new Mat_SeqAIJKokkos(zcsr); 13319566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 13329566063dSJacob Faibussowitsch PetscCall(MatHeaderReplace(Y, &Z)); 1333c17cf699SJunchao Zhang kh.destroy_spadd_handle(); 1334c17cf699SJunchao Zhang } 13359566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 13360e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 13373ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1338a587d139SMark } 1339a587d139SMark 13402c4ab24aSJunchao Zhang struct MatCOOStruct_SeqAIJKokkos { 13412c4ab24aSJunchao Zhang PetscCount n; 13422c4ab24aSJunchao Zhang PetscCount Atot; 13432c4ab24aSJunchao Zhang PetscInt nz; 13442c4ab24aSJunchao Zhang PetscCountKokkosView jmap; 13452c4ab24aSJunchao Zhang PetscCountKokkosView perm; 13462c4ab24aSJunchao Zhang 13472c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 13482c4ab24aSJunchao Zhang { 13492c4ab24aSJunchao Zhang nz = coo_h->nz; 13502c4ab24aSJunchao Zhang n = coo_h->n; 13512c4ab24aSJunchao Zhang Atot = coo_h->Atot; 13522c4ab24aSJunchao Zhang jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 13532c4ab24aSJunchao Zhang perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 13542c4ab24aSJunchao Zhang } 13552c4ab24aSJunchao Zhang }; 13562c4ab24aSJunchao Zhang 135749abdd8aSBarry Smith static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data) 13582c4ab24aSJunchao Zhang { 13592c4ab24aSJunchao Zhang PetscFunctionBegin; 136049abdd8aSBarry Smith PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data)); 13612c4ab24aSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 13622c4ab24aSJunchao Zhang } 13632c4ab24aSJunchao Zhang 1364d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1365d71ae5a4SJacob Faibussowitsch { 136642550becSJunchao Zhang Mat_SeqAIJKokkos *akok; 136742550becSJunchao Zhang Mat_SeqAIJ *aseq; 136803e76207SPierre Jolivet PetscContainer container_h; 13692c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJ *coo_h; 13702c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJKokkos *coo_d; 137142550becSJunchao Zhang 137242550becSJunchao Zhang PetscFunctionBegin; 13739566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1374394ed5ebSJunchao Zhang aseq = static_cast<Mat_SeqAIJ *>(mat->data); 137542550becSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1376cbc6b225SStefano Zampini delete akok; 1377f4747e26SJunchao Zhang mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 13789566063dSJacob Faibussowitsch PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 13792c4ab24aSJunchao Zhang 13802c4ab24aSJunchao Zhang // Copy the COO struct to device 13812c4ab24aSJunchao Zhang PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 13822c4ab24aSJunchao Zhang PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 13832c4ab24aSJunchao Zhang PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 13842c4ab24aSJunchao Zhang 13852c4ab24aSJunchao Zhang // Put the COO struct in a container and then attach that to the matrix 138603e76207SPierre Jolivet PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos)); 13873ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 138842550becSJunchao Zhang } 138942550becSJunchao Zhang 1390d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1391d71ae5a4SJacob Faibussowitsch { 139242550becSJunchao Zhang MatScalarKokkosView Aa; 139342550becSJunchao Zhang ConstMatScalarKokkosView kv; 139442550becSJunchao Zhang PetscMemType memtype; 13952c4ab24aSJunchao Zhang PetscContainer container; 13962c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJKokkos *coo; 139742550becSJunchao Zhang 139842550becSJunchao Zhang PetscFunctionBegin; 13992c4ab24aSJunchao Zhang PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 14002c4ab24aSJunchao Zhang PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 14012c4ab24aSJunchao Zhang 14022c4ab24aSJunchao Zhang const auto &n = coo->n; 14032c4ab24aSJunchao Zhang const auto &Annz = coo->nz; 14042c4ab24aSJunchao Zhang const auto &jmap = coo->jmap; 14052c4ab24aSJunchao Zhang const auto &perm = coo->perm; 14062c4ab24aSJunchao Zhang 14079566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(v, &memtype)); 140842550becSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 14092c4ab24aSJunchao Zhang kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 141042550becSJunchao Zhang } else { 14112c4ab24aSJunchao Zhang kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 141242550becSJunchao Zhang } 141342550becSJunchao Zhang 1414c7b718f4SJunchao Zhang if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1415c7b718f4SJunchao Zhang else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 141642550becSJunchao Zhang 141708bb9926SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 14189371c9d4SSatish Balay Kokkos::parallel_for( 1419d326c3f1SJunchao Zhang Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1420c7b718f4SJunchao Zhang PetscScalar sum = 0.0; 1421c7b718f4SJunchao Zhang for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1422c7b718f4SJunchao Zhang Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1423c7b718f4SJunchao Zhang }); 142408bb9926SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1425394ed5ebSJunchao Zhang 14269566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 14279566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 14283ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 142942550becSJunchao Zhang } 143042550becSJunchao Zhang 1431d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1432d71ae5a4SJacob Faibussowitsch { 1433076ba34aSJunchao Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1434076ba34aSJunchao Zhang 14358c3ff71bSJunchao Zhang PetscFunctionBegin; 1436076ba34aSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 14376f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 14386f3d89d0SStefano Zampini 14398c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 14408c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 14418c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1442a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 1443f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 1444a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1445076ba34aSJunchao Zhang A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 14468c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 14478c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 14488c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 14498c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 14508c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 14518c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1452076ba34aSJunchao Zhang A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 14530ecb592aSJunchao Zhang A->ops->transpose = MatTranspose_SeqAIJKokkos; 1454152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 1455f78ce678SMark Adams A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1456f4747e26SJunchao Zhang A->ops->shift = MatShift_SeqAIJKokkos; 1457f4747e26SJunchao Zhang A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1458f4747e26SJunchao Zhang A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 145903db1824SAlex Lindsay A->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJKokkos; 1460076ba34aSJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1461076ba34aSJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1462076ba34aSJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1463076ba34aSJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1464076ba34aSJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1465076ba34aSJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 14667ee59b9bSJunchao Zhang a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 146742550becSJunchao Zhang 14689566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 14699566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 147057761e9aSJunchao Zhang #if defined(PETSC_HAVE_HYPRE) 147157761e9aSJunchao Zhang PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE)); 147257761e9aSJunchao Zhang #endif 14733ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1474076ba34aSJunchao Zhang } 1475076ba34aSJunchao Zhang 14769d13fa56SJunchao Zhang /* 14779d13fa56SJunchao Zhang Extract the (prescribled) diagonal blocks of the matrix and then invert them 14789d13fa56SJunchao Zhang 14799d13fa56SJunchao Zhang Input Parameters: 14809d13fa56SJunchao Zhang + A - the MATSEQAIJKOKKOS matrix 14819d13fa56SJunchao Zhang . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 14829d13fa56SJunchao Zhang . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 14839d13fa56SJunchao Zhang . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 14849d13fa56SJunchao Zhang - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 14859d13fa56SJunchao Zhang 14869d13fa56SJunchao Zhang Output Parameter: 14879d13fa56SJunchao Zhang . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 14889d13fa56SJunchao Zhang */ 14899d13fa56SJunchao Zhang PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 14909d13fa56SJunchao Zhang { 14919d13fa56SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 14929d13fa56SJunchao Zhang PetscInt nblocks = bs.extent(0) - 1; 14939d13fa56SJunchao Zhang 14949d13fa56SJunchao Zhang PetscFunctionBegin; 14959d13fa56SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 14969d13fa56SJunchao Zhang 14979d13fa56SJunchao Zhang // Pull out the diagonal blocks of the matrix and then invert the blocks 14989d13fa56SJunchao Zhang auto Aa = akok->a_dual.view_device(); 14999d13fa56SJunchao Zhang auto Ai = akok->i_dual.view_device(); 15009d13fa56SJunchao Zhang auto Aj = akok->j_dual.view_device(); 15019d13fa56SJunchao Zhang auto Adiag = akok->diag_dual.view_device(); 15029d13fa56SJunchao Zhang // TODO: how to tune the team size? 150345402d8aSJunchao Zhang #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY) 15049d13fa56SJunchao Zhang auto ts = Kokkos::AUTO(); 15059d13fa56SJunchao Zhang #else 15069d13fa56SJunchao Zhang auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 15079d13fa56SJunchao Zhang #endif 15089d13fa56SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( 1509d326c3f1SJunchao Zhang Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 15109d13fa56SJunchao Zhang const PetscInt bid = teamMember.league_rank(); // block id 15119d13fa56SJunchao Zhang const PetscInt rstart = bs(bid); // this block starts from this row 15129d13fa56SJunchao Zhang const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 15139d13fa56SJunchao Zhang const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 15149d13fa56SJunchao Zhang const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 15159d13fa56SJunchao Zhang 15169d13fa56SJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 15179d13fa56SJunchao Zhang PetscInt i = rstart + r; // i-th row in A 15189d13fa56SJunchao Zhang 15199d13fa56SJunchao Zhang if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 15209d13fa56SJunchao Zhang PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 15219d13fa56SJunchao Zhang 15229d13fa56SJunchao Zhang for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 15239d13fa56SJunchao Zhang if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 15249d13fa56SJunchao Zhang B(r, c) = 0.0; 15259d13fa56SJunchao Zhang } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 15269d13fa56SJunchao Zhang B(r, c) = Aa(first + c); 15279d13fa56SJunchao Zhang } else { // this entry does not show up in the CSR 15289d13fa56SJunchao Zhang B(r, c) = 0.0; 15299d13fa56SJunchao Zhang } 15309d13fa56SJunchao Zhang } 15319d13fa56SJunchao Zhang } else { // rare case that the diagonal does not exist 15329d13fa56SJunchao Zhang const PetscInt begin = Ai(i); 15339d13fa56SJunchao Zhang const PetscInt end = Ai(i + 1); 15349d13fa56SJunchao Zhang for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 15359d13fa56SJunchao Zhang for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 15369d13fa56SJunchao Zhang if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 15379d13fa56SJunchao Zhang else if (Aj(j) >= rstart + m) break; 15389d13fa56SJunchao Zhang } 15399d13fa56SJunchao Zhang } 15409d13fa56SJunchao Zhang }); 15419d13fa56SJunchao Zhang 15429d13fa56SJunchao Zhang // LU-decompose B (w/o pivoting) and then invert B 15439d13fa56SJunchao Zhang KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 15449d13fa56SJunchao Zhang KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 15459d13fa56SJunchao Zhang })); 15469d13fa56SJunchao Zhang // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 15479d13fa56SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 15489d13fa56SJunchao Zhang } 15499d13fa56SJunchao Zhang 1550d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1551d71ae5a4SJacob Faibussowitsch { 1552076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 1553076ba34aSJunchao Zhang PetscInt i, m, n; 15544df4a32cSJunchao Zhang auto exec = PetscGetKokkosExecutionSpace(); 1555076ba34aSJunchao Zhang 1556076ba34aSJunchao Zhang PetscFunctionBegin; 15575f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1558076ba34aSJunchao Zhang 1559076ba34aSJunchao Zhang m = akok->nrows(); 1560076ba34aSJunchao Zhang n = akok->ncols(); 15619566063dSJacob Faibussowitsch PetscCall(MatSetSizes(A, m, n, m, n)); 15629566063dSJacob Faibussowitsch PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1563076ba34aSJunchao Zhang 1564076ba34aSJunchao Zhang /* Set up data structures of A as a MATSEQAIJ */ 15659566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 156657508eceSPierre Jolivet aseq = (Mat_SeqAIJ *)A->data; 1567076ba34aSJunchao Zhang 1568f3d3cd90SZach Atkins PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */ 1569f3d3cd90SZach Atkins PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec)); 1570076ba34aSJunchao Zhang 1571076ba34aSJunchao Zhang aseq->i = akok->i_host_data(); 1572076ba34aSJunchao Zhang aseq->j = akok->j_host_data(); 1573076ba34aSJunchao Zhang aseq->a = akok->a_host_data(); 1574076ba34aSJunchao Zhang aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1575076ba34aSJunchao Zhang aseq->free_a = PETSC_FALSE; 1576076ba34aSJunchao Zhang aseq->free_ij = PETSC_FALSE; 1577076ba34aSJunchao Zhang aseq->nz = akok->nnz(); 1578076ba34aSJunchao Zhang aseq->maxnz = aseq->nz; 1579076ba34aSJunchao Zhang 15809566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aseq->imax)); 15819566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aseq->ilen)); 1582ad540459SPierre Jolivet for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1583076ba34aSJunchao Zhang 1584076ba34aSJunchao Zhang /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1585076ba34aSJunchao Zhang akok->nonzerostate = A->nonzerostate; 1586ff751488SJunchao Zhang A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 15879566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 15889566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 15893ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1590076ba34aSJunchao Zhang } 1591076ba34aSJunchao Zhang 15920e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 15930e3ece09SJunchao Zhang { 15940e3ece09SJunchao Zhang PetscFunctionBegin; 15950e3ece09SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(A)); 15960e3ece09SJunchao Zhang *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 15970e3ece09SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 15980e3ece09SJunchao Zhang } 15990e3ece09SJunchao Zhang 16000e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 16010e3ece09SJunchao Zhang { 16020e3ece09SJunchao Zhang Mat_SeqAIJKokkos *akok; 16034d86920dSPierre Jolivet 16040e3ece09SJunchao Zhang PetscFunctionBegin; 16050e3ece09SJunchao Zhang PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 16060e3ece09SJunchao Zhang PetscCall(MatCreate(comm, A)); 16070e3ece09SJunchao Zhang PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 16080e3ece09SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 16090e3ece09SJunchao Zhang } 16100e3ece09SJunchao Zhang 1611076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1612076ba34aSJunchao Zhang 1613076ba34aSJunchao Zhang Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1614076ba34aSJunchao Zhang */ 1615d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1616d71ae5a4SJacob Faibussowitsch { 1617076ba34aSJunchao Zhang PetscFunctionBegin; 16189566063dSJacob Faibussowitsch PetscCall(MatCreate(comm, A)); 16199566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 16203ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 16218c3ff71bSJunchao Zhang } 16228c3ff71bSJunchao Zhang 1623152b3e56SJunchao Zhang /*@C 162411a5261eSBarry Smith MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 16258c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 162620f4b53cSBarry Smith Kokkos for calculations. 16278c3ff71bSJunchao Zhang 16288c3ff71bSJunchao Zhang Collective 16298c3ff71bSJunchao Zhang 16308c3ff71bSJunchao Zhang Input Parameters: 163111a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF` 16328c3ff71bSJunchao Zhang . m - number of rows 16338c3ff71bSJunchao Zhang . n - number of columns 163420f4b53cSBarry Smith . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 163520f4b53cSBarry Smith - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 16368c3ff71bSJunchao Zhang 16378c3ff71bSJunchao Zhang Output Parameter: 16388c3ff71bSJunchao Zhang . A - the matrix 16398c3ff71bSJunchao Zhang 16402ef1f0ffSBarry Smith Level: intermediate 16412ef1f0ffSBarry Smith 16422ef1f0ffSBarry Smith Notes: 164311a5261eSBarry Smith It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 16448c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 164511a5261eSBarry Smith [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 16468c3ff71bSJunchao Zhang 164711a5261eSBarry Smith The AIJ format, also called 16482ef1f0ffSBarry Smith compressed row storage, is fully compatible with standard Fortran 16498c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 165020f4b53cSBarry Smith either one (as in Fortran) or zero. 16518c3ff71bSJunchao Zhang 16522ef1f0ffSBarry Smith Specify the preallocated storage with either `nz` or `nnz` (not both). 16532ef1f0ffSBarry Smith Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 16542ef1f0ffSBarry Smith allocation. 16558c3ff71bSJunchao Zhang 1656fe59aa6dSJacob Faibussowitsch .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 16578c3ff71bSJunchao Zhang @*/ 1658d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1659d71ae5a4SJacob Faibussowitsch { 16608c3ff71bSJunchao Zhang PetscFunctionBegin; 16619566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 16629566063dSJacob Faibussowitsch PetscCall(MatCreate(comm, A)); 16639566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A, m, n, m, n)); 16649566063dSJacob Faibussowitsch PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 16659566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 16663ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 16678c3ff71bSJunchao Zhang } 1668930e68a5SMark Adams 1669aac854edSJunchao Zhang // After matrix numeric factorization, there are still steps to do before triangular solve can be called. 1670aac854edSJunchao Zhang // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does). 1671aac854edSJunchao Zhang // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve(). 1672aac854edSJunchao Zhang // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck. 1673aac854edSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A) 1674d71ae5a4SJacob Faibussowitsch { 167586a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1676aac854edSJunchao Zhang const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1677aac854edSJunchao Zhang const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy 167886a27549SJunchao Zhang 167986a27549SJunchao Zhang PetscFunctionBegin; 1680aac854edSJunchao Zhang if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet 1681aac854edSJunchao Zhang if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d)); 1682aac854edSJunchao Zhang if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d)); 168386a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_TRUE; 168486a27549SJunchao Zhang } 16853ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 168686a27549SJunchao Zhang } 168786a27549SJunchao Zhang 1688d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1689d71ae5a4SJacob Faibussowitsch { 1690aac854edSJunchao Zhang const PetscInt n = A->rmap->n; 169186a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1692aac854edSJunchao Zhang const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1693aac854edSJunchao Zhang const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy 169486a27549SJunchao Zhang 169586a27549SJunchao Zhang PetscFunctionBegin; 1696aac854edSJunchao Zhang if (!factors->transpose_updated) { 1697aac854edSJunchao Zhang if (has_upper) { 1698aac854edSJunchao Zhang if (!factors->iUt_d.extent(0)) { // Allocate Ut on device if not yet 1699aac854edSJunchao Zhang factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 17007b8d4ba6SJunchao Zhang factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 17017b8d4ba6SJunchao Zhang factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1702aac854edSJunchao Zhang } 170386a27549SJunchao Zhang 1704aac854edSJunchao Zhang if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host 1705aac854edSJunchao Zhang if (!factors->U) { 1706aac854edSJunchao Zhang Mat_SeqAIJ *seq; 170786a27549SJunchao Zhang 1708aac854edSJunchao Zhang PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U)); 1709aac854edSJunchao Zhang PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut)); 171086a27549SJunchao Zhang 1711aac854edSJunchao Zhang seq = static_cast<Mat_SeqAIJ *>(factors->Ut->data); 1712aac854edSJunchao Zhang factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1713aac854edSJunchao Zhang factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1714aac854edSJunchao Zhang factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1715aac854edSJunchao Zhang } else { 1716aac854edSJunchao Zhang PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h 1717aac854edSJunchao Zhang } 1718aac854edSJunchao Zhang // Copy Ut from host to device 1719aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h)); 1720aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h)); 1721aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h)); 1722aac854edSJunchao Zhang } else { // If U was computed on device, we also compute the transpose there 1723aac854edSJunchao Zhang // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1724aac854edSJunchao Zhang PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, 1725aac854edSJunchao Zhang factors->jU_d, factors->aU_d, 1726aac854edSJunchao Zhang factors->iUt_d, factors->jUt_d, 1727aac854edSJunchao Zhang factors->aUt_d)); 1728aac854edSJunchao Zhang PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1729aac854edSJunchao Zhang } 1730aac854edSJunchao Zhang PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1731aac854edSJunchao Zhang } 1732aac854edSJunchao Zhang 1733aac854edSJunchao Zhang // do the same for L with LU 1734aac854edSJunchao Zhang if (has_lower) { 1735aac854edSJunchao Zhang if (!factors->iLt_d.extent(0)) { // Allocate Lt on device if not yet 1736aac854edSJunchao Zhang factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1737aac854edSJunchao Zhang factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1738aac854edSJunchao Zhang factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1739aac854edSJunchao Zhang } 1740aac854edSJunchao Zhang 1741aac854edSJunchao Zhang if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host 1742aac854edSJunchao Zhang if (!factors->L) { 1743aac854edSJunchao Zhang Mat_SeqAIJ *seq; 1744aac854edSJunchao Zhang 1745aac854edSJunchao Zhang PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L)); 1746aac854edSJunchao Zhang PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt)); 1747aac854edSJunchao Zhang 1748aac854edSJunchao Zhang seq = static_cast<Mat_SeqAIJ *>(factors->Lt->data); 1749aac854edSJunchao Zhang factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1750aac854edSJunchao Zhang factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1751aac854edSJunchao Zhang factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1752aac854edSJunchao Zhang } else { 1753aac854edSJunchao Zhang PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h 1754aac854edSJunchao Zhang } 1755aac854edSJunchao Zhang // Copy Lt from host to device 1756aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h)); 1757aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h)); 1758aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h)); 1759aac854edSJunchao Zhang } else { // If L was computed on device, we also compute the transpose there 1760aac854edSJunchao Zhang // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1761aac854edSJunchao Zhang PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, 1762aac854edSJunchao Zhang factors->jL_d, factors->aL_d, 1763aac854edSJunchao Zhang factors->iLt_d, factors->jLt_d, 1764aac854edSJunchao Zhang factors->aLt_d)); 1765aac854edSJunchao Zhang PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1766aac854edSJunchao Zhang } 1767aac854edSJunchao Zhang PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1768aac854edSJunchao Zhang } 1769aac854edSJunchao Zhang 177086a27549SJunchao Zhang factors->transpose_updated = PETSC_TRUE; 177186a27549SJunchao Zhang } 17723ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 177386a27549SJunchao Zhang } 177486a27549SJunchao Zhang 1775aac854edSJunchao Zhang // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A. 1776aac854edSJunchao Zhang // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty. 1777aac854edSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx) 1778d71ae5a4SJacob Faibussowitsch { 1779aac854edSJunchao Zhang auto exec = PetscGetKokkosExecutionSpace(); 178086a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1781aac854edSJunchao Zhang PetscInt m = A->rmap->n; 1782aac854edSJunchao Zhang PetscScalarKokkosView D = factors->D_d; 1783aac854edSJunchao Zhang PetscScalarKokkosView X, Y, B; // alias 1784aac854edSJunchao Zhang ConstPetscScalarKokkosView b; 1785aac854edSJunchao Zhang PetscScalarKokkosView x; 1786aac854edSJunchao Zhang PetscIntKokkosView &rowperm = factors->rowperm; 1787aac854edSJunchao Zhang PetscBool identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 178886a27549SJunchao Zhang 178986a27549SJunchao Zhang PetscFunctionBegin; 17909566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1791aac854edSJunchao Zhang PetscCall(MatSeqAIJKokkosSolveCheck(A)); // for UX = T 1792aac854edSJunchao Zhang PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B 1793aac854edSJunchao Zhang PetscCall(VecGetKokkosView(bb, &b)); 1794aac854edSJunchao Zhang PetscCall(VecGetKokkosViewWrite(xx, &x)); 1795aac854edSJunchao Zhang 1796aac854edSJunchao Zhang // Solve U^T Y = B 1797aac854edSJunchao Zhang if (identity) { // Reorder b with the row permutation 1798aac854edSJunchao Zhang B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1799aac854edSJunchao Zhang Y = factors->workVector; 1800aac854edSJunchao Zhang } else { 1801aac854edSJunchao Zhang B = factors->workVector; 1802aac854edSJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1803aac854edSJunchao Zhang Y = x; 1804aac854edSJunchao Zhang } 1805aac854edSJunchao Zhang PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1806aac854edSJunchao Zhang 1807aac854edSJunchao Zhang // Solve diag(D) Y' = Y. 1808aac854edSJunchao Zhang // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication. 1809aac854edSJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); })); 1810aac854edSJunchao Zhang 1811aac854edSJunchao Zhang // Solve UX = Y 1812aac854edSJunchao Zhang if (identity) { 1813aac854edSJunchao Zhang X = x; 1814aac854edSJunchao Zhang } else { 1815aac854edSJunchao Zhang X = factors->workVector; // B is not needed anymore 1816aac854edSJunchao Zhang } 1817aac854edSJunchao Zhang PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1818aac854edSJunchao Zhang 1819aac854edSJunchao Zhang // Reorder X with the inverse column (row) permutation 18203a7d0413SPierre Jolivet if (!identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1821aac854edSJunchao Zhang 1822aac854edSJunchao Zhang PetscCall(VecRestoreKokkosView(bb, &b)); 1823aac854edSJunchao Zhang PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 18249566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 18253ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 182686a27549SJunchao Zhang } 182786a27549SJunchao Zhang 1828aac854edSJunchao Zhang // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1829aac854edSJunchao Zhang // R and C are represented by rowperm and colperm in factors. 1830aac854edSJunchao Zhang // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1831aac854edSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1832d71ae5a4SJacob Faibussowitsch { 1833aac854edSJunchao Zhang auto exec = PetscGetKokkosExecutionSpace(); 183486a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1835aac854edSJunchao Zhang PetscInt m = A->rmap->n; 1836aac854edSJunchao Zhang PetscScalarKokkosView X, Y, B; // alias 1837aac854edSJunchao Zhang ConstPetscScalarKokkosView b; 1838aac854edSJunchao Zhang PetscScalarKokkosView x; 1839aac854edSJunchao Zhang PetscIntKokkosView &rowperm = factors->rowperm; 1840aac854edSJunchao Zhang PetscIntKokkosView &colperm = factors->colperm; 1841aac854edSJunchao Zhang PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1842aac854edSJunchao Zhang PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 184386a27549SJunchao Zhang 184486a27549SJunchao Zhang PetscFunctionBegin; 18459566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1846aac854edSJunchao Zhang PetscCall(MatSeqAIJKokkosSolveCheck(A)); 1847aac854edSJunchao Zhang PetscCall(VecGetKokkosView(bb, &b)); 1848aac854edSJunchao Zhang PetscCall(VecGetKokkosViewWrite(xx, &x)); 184986a27549SJunchao Zhang 1850aac854edSJunchao Zhang // Solve L Y = B (i.e., L (U C^- x) = R b). R b indicates applying the row permutation on b. 1851aac854edSJunchao Zhang if (row_identity) { 1852aac854edSJunchao Zhang B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1853aac854edSJunchao Zhang Y = factors->workVector; 1854aac854edSJunchao Zhang } else { 1855aac854edSJunchao Zhang B = factors->workVector; 1856aac854edSJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1857aac854edSJunchao Zhang Y = x; 1858aac854edSJunchao Zhang } 1859aac854edSJunchao Zhang PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y)); 1860aac854edSJunchao Zhang 1861aac854edSJunchao Zhang // Solve U C^- x = Y 1862aac854edSJunchao Zhang if (col_identity) { 1863aac854edSJunchao Zhang X = x; 1864aac854edSJunchao Zhang } else { 1865aac854edSJunchao Zhang X = factors->workVector; 1866aac854edSJunchao Zhang } 1867aac854edSJunchao Zhang PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1868aac854edSJunchao Zhang 1869aac854edSJunchao Zhang // x = C X; Reorder X with the inverse col permutation 18703a7d0413SPierre Jolivet if (!col_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); })); 1871aac854edSJunchao Zhang 1872aac854edSJunchao Zhang PetscCall(VecRestoreKokkosView(bb, &b)); 1873aac854edSJunchao Zhang PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 18749566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 18753ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 187686a27549SJunchao Zhang } 187786a27549SJunchao Zhang 1878aac854edSJunchao Zhang // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1879aac854edSJunchao Zhang // R and C are represented by rowperm and colperm in factors. 1880aac854edSJunchao Zhang // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1881aac854edSJunchao Zhang // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R. 1882aac854edSJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1883aac854edSJunchao Zhang { 1884aac854edSJunchao Zhang auto exec = PetscGetKokkosExecutionSpace(); 1885aac854edSJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1886aac854edSJunchao Zhang PetscInt m = A->rmap->n; 1887aac854edSJunchao Zhang PetscScalarKokkosView X, Y, B; // alias 1888aac854edSJunchao Zhang ConstPetscScalarKokkosView b; 1889aac854edSJunchao Zhang PetscScalarKokkosView x; 1890aac854edSJunchao Zhang PetscIntKokkosView &rowperm = factors->rowperm; 1891aac854edSJunchao Zhang PetscIntKokkosView &colperm = factors->colperm; 1892aac854edSJunchao Zhang PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1893aac854edSJunchao Zhang PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1894aac854edSJunchao Zhang 1895aac854edSJunchao Zhang PetscFunctionBegin; 1896aac854edSJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1897aac854edSJunchao Zhang PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T 1898aac854edSJunchao Zhang PetscCall(VecGetKokkosView(bb, &b)); 1899aac854edSJunchao Zhang PetscCall(VecGetKokkosViewWrite(xx, &x)); 1900aac854edSJunchao Zhang 1901aac854edSJunchao Zhang // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b). Note C^- b = C^T b, which means applying the column permutation on b. 1902aac854edSJunchao Zhang if (col_identity) { // Reorder b with the col permutation 1903aac854edSJunchao Zhang B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1904aac854edSJunchao Zhang Y = factors->workVector; 1905aac854edSJunchao Zhang } else { 1906aac854edSJunchao Zhang B = factors->workVector; 1907aac854edSJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); })); 1908aac854edSJunchao Zhang Y = x; 1909aac854edSJunchao Zhang } 1910aac854edSJunchao Zhang PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1911aac854edSJunchao Zhang 1912aac854edSJunchao Zhang // Solve L^T X = Y 1913aac854edSJunchao Zhang if (row_identity) { 1914aac854edSJunchao Zhang X = x; 1915aac854edSJunchao Zhang } else { 1916aac854edSJunchao Zhang X = factors->workVector; 1917aac854edSJunchao Zhang } 1918aac854edSJunchao Zhang PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X)); 1919aac854edSJunchao Zhang 1920aac854edSJunchao Zhang // x = R^- X = R^T X; Reorder X with the inverse row permutation 19213a7d0413SPierre Jolivet if (!row_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1922aac854edSJunchao Zhang 1923aac854edSJunchao Zhang PetscCall(VecRestoreKokkosView(bb, &b)); 1924aac854edSJunchao Zhang PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1925aac854edSJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1926aac854edSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 1927aac854edSJunchao Zhang } 1928aac854edSJunchao Zhang 1929aac854edSJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1930aac854edSJunchao Zhang { 1931aac854edSJunchao Zhang PetscFunctionBegin; 1932aac854edSJunchao Zhang PetscCall(MatSeqAIJKokkosSyncHost(A)); 1933aac854edSJunchao Zhang PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1934aac854edSJunchao Zhang 1935aac854edSJunchao Zhang if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 1936aac854edSJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1937aac854edSJunchao Zhang Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 1938aac854edSJunchao Zhang const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 1939aac854edSJunchao Zhang const MatScalar *Ba = b->a; 1940aac854edSJunchao Zhang PetscInt m = B->rmap->n, n = B->cmap->n; 1941aac854edSJunchao Zhang 1942aac854edSJunchao Zhang if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time 1943aac854edSJunchao Zhang // Allocate memory and copy the structure 1944aac854edSJunchao Zhang factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1); 1945aac854edSJunchao Zhang factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries 1946aac854edSJunchao Zhang factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m); 1947aac854edSJunchao Zhang factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1); 1948aac854edSJunchao Zhang factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m])); 1949aac854edSJunchao Zhang factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m])); 1950aac854edSJunchao Zhang 1951aac854edSJunchao Zhang PetscInt *Li = factors->iL_h.data(); 1952aac854edSJunchao Zhang PetscInt *Lj = factors->jL_h.data(); 1953aac854edSJunchao Zhang PetscInt *Ui = factors->iU_h.data(); 1954aac854edSJunchao Zhang PetscInt *Uj = factors->jU_h.data(); 1955aac854edSJunchao Zhang 1956aac854edSJunchao Zhang Li[0] = Ui[0] = 0; 1957aac854edSJunchao Zhang for (PetscInt i = 0; i < m; i++) { 1958aac854edSJunchao Zhang PetscInt llen = Bi[i + 1] - Bi[i]; // exclusive of the diagonal entry 1959aac854edSJunchao Zhang PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry 1960aac854edSJunchao Zhang 1961*64dc1d19SNuno Nobre PetscCall(PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen)); // entries of L on the left of the diagonal 1962aac854edSJunchao Zhang Lj[Li[i] + llen] = i; // diagonal entry of L 1963aac854edSJunchao Zhang 1964aac854edSJunchao Zhang Uj[Ui[i]] = i; // diagonal entry of U 1965*64dc1d19SNuno Nobre PetscCall(PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal 1966aac854edSJunchao Zhang 1967aac854edSJunchao Zhang Li[i + 1] = Li[i] + llen + 1; 1968aac854edSJunchao Zhang Ui[i + 1] = Ui[i] + ulen; 1969aac854edSJunchao Zhang } 1970aac854edSJunchao Zhang 1971aac854edSJunchao Zhang factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h); 1972aac854edSJunchao Zhang factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h); 1973aac854edSJunchao Zhang factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h); 1974aac854edSJunchao Zhang factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h); 1975aac854edSJunchao Zhang factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h); 1976aac854edSJunchao Zhang factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 1977aac854edSJunchao Zhang 1978aac854edSJunchao Zhang // Copy row/col permutation to device 1979aac854edSJunchao Zhang IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 1980aac854edSJunchao Zhang PetscBool row_identity; 1981aac854edSJunchao Zhang PetscCall(ISIdentity(rowperm, &row_identity)); 1982aac854edSJunchao Zhang if (!row_identity) { 1983aac854edSJunchao Zhang const PetscInt *ip; 1984aac854edSJunchao Zhang 1985aac854edSJunchao Zhang PetscCall(ISGetIndices(rowperm, &ip)); 1986aac854edSJunchao Zhang factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m); 1987aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 1988aac854edSJunchao Zhang PetscCall(ISRestoreIndices(rowperm, &ip)); 1989aac854edSJunchao Zhang PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 1990aac854edSJunchao Zhang } 1991aac854edSJunchao Zhang 1992aac854edSJunchao Zhang IS colperm = ((Mat_SeqAIJ *)B->data)->col; 1993aac854edSJunchao Zhang PetscBool col_identity; 1994aac854edSJunchao Zhang PetscCall(ISIdentity(colperm, &col_identity)); 1995aac854edSJunchao Zhang if (!col_identity) { 1996aac854edSJunchao Zhang const PetscInt *ip; 1997aac854edSJunchao Zhang 1998aac854edSJunchao Zhang PetscCall(ISGetIndices(colperm, &ip)); 1999aac854edSJunchao Zhang factors->colperm = PetscIntKokkosView(NoInit("colperm"), n); 2000aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n))); 2001aac854edSJunchao Zhang PetscCall(ISRestoreIndices(colperm, &ip)); 2002aac854edSJunchao Zhang PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 2003aac854edSJunchao Zhang } 2004aac854edSJunchao Zhang 2005aac854edSJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 2006aac854edSJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2007aac854edSJunchao Zhang auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2008aac854edSJunchao Zhang #else 2009aac854edSJunchao Zhang auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2010aac854edSJunchao Zhang #endif 2011aac854edSJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */); 2012aac854edSJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2013aac854edSJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */); 2014aac854edSJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2015aac854edSJunchao Zhang } 2016aac854edSJunchao Zhang 2017aac854edSJunchao Zhang // Copy the value 2018aac854edSJunchao Zhang for (PetscInt i = 0; i < m; i++) { 2019aac854edSJunchao Zhang PetscInt llen = Bi[i + 1] - Bi[i]; 2020aac854edSJunchao Zhang PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; 2021aac854edSJunchao Zhang const PetscInt *Li = factors->iL_h.data(); 2022aac854edSJunchao Zhang const PetscInt *Ui = factors->iU_h.data(); 2023aac854edSJunchao Zhang 2024aac854edSJunchao Zhang PetscScalar *La = factors->aL_h.data(); 2025aac854edSJunchao Zhang PetscScalar *Ua = factors->aU_h.data(); 2026aac854edSJunchao Zhang 2027*64dc1d19SNuno Nobre PetscCall(PetscArraycpy(La + Li[i], Ba + Bi[i], llen)); // entries of L 2028aac854edSJunchao Zhang La[Li[i] + llen] = 1.0; // diagonal entry 2029aac854edSJunchao Zhang 2030aac854edSJunchao Zhang Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]; // diagonal entry 2031*64dc1d19SNuno Nobre PetscCall(PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1)); // entries of U 2032aac854edSJunchao Zhang } 2033aac854edSJunchao Zhang 2034aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h)); 2035aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2036aac854edSJunchao Zhang // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic 2037aac854edSJunchao Zhang factors->transpose_updated = PETSC_FALSE; 2038aac854edSJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 2039aac854edSJunchao Zhang 2040aac854edSJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2041aac854edSJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2042aac854edSJunchao Zhang } 2043aac854edSJunchao Zhang 2044aac854edSJunchao Zhang B->ops->matsolve = NULL; 2045aac854edSJunchao Zhang B->ops->matsolvetranspose = NULL; 2046aac854edSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 2047aac854edSJunchao Zhang } 2048aac854edSJunchao Zhang 2049aac854edSJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info) 2050d71ae5a4SJacob Faibussowitsch { 205186a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 205286a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 205386a27549SJunchao Zhang PetscInt fill_lev = info->levels; 205486a27549SJunchao Zhang 205586a27549SJunchao Zhang PetscFunctionBegin; 20569566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2057aac854edSJunchao Zhang PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false"); 20589566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2059076ba34aSJunchao Zhang 2060076ba34aSJunchao Zhang auto a_d = aijkok->a_dual.view_device(); 2061076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 2062076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 2063076ba34aSJunchao Zhang 2064aac854edSJunchao Zhang PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d)); 206586a27549SJunchao Zhang 206686a27549SJunchao Zhang B->assembled = PETSC_TRUE; 206786a27549SJunchao Zhang B->preallocated = PETSC_TRUE; 2068aac854edSJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2069aac854edSJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 207086a27549SJunchao Zhang B->ops->matsolve = NULL; 207186a27549SJunchao Zhang B->ops->matsolvetranspose = NULL; 207286a27549SJunchao Zhang 207386a27549SJunchao Zhang /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 207486a27549SJunchao Zhang factors->transpose_updated = PETSC_FALSE; 207586a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 2076eeadb341SJunchao Zhang /* TODO: log flops, but how to know that? */ 20779566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 20783ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 207986a27549SJunchao Zhang } 208086a27549SJunchao Zhang 2081aac854edSJunchao Zhang // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering 2082aac854edSJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info) 2083d71ae5a4SJacob Faibussowitsch { 208486a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 208586a27549SJunchao Zhang Mat_SeqAIJ *b; 208686a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 208786a27549SJunchao Zhang PetscInt fill_lev = info->levels; 208886a27549SJunchao Zhang PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 208986a27549SJunchao Zhang PetscInt n = A->rmap->n; 209086a27549SJunchao Zhang 209186a27549SJunchao Zhang PetscFunctionBegin; 2092aac854edSJunchao Zhang PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now"); 20939566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 209486a27549SJunchao Zhang 209586a27549SJunchao Zhang /* Create a spiluk handle and then do symbolic factorization */ 209686a27549SJunchao Zhang nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 2097aac854edSJunchao Zhang factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 209886a27549SJunchao Zhang 209986a27549SJunchao Zhang auto spiluk_handle = factors->kh.get_spiluk_handle(); 210086a27549SJunchao Zhang 210186a27549SJunchao Zhang Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 210286a27549SJunchao Zhang Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 210386a27549SJunchao Zhang Kokkos::realloc(factors->iU_d, n + 1); 210486a27549SJunchao Zhang Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 210586a27549SJunchao Zhang 210686a27549SJunchao Zhang aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2107076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 2108076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 2109aac854edSJunchao Zhang PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d)); 211086a27549SJunchao Zhang /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 211186a27549SJunchao Zhang 211286a27549SJunchao Zhang Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 211386a27549SJunchao Zhang Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 211486a27549SJunchao Zhang Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 211586a27549SJunchao Zhang Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 211686a27549SJunchao Zhang 211786a27549SJunchao Zhang /* TODO: add options to select sptrsv algorithms */ 211886a27549SJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 211986a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2120aac854edSJunchao Zhang auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 212186a27549SJunchao Zhang #else 2122aac854edSJunchao Zhang auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 212386a27549SJunchao Zhang #endif 212486a27549SJunchao Zhang 212586a27549SJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 212686a27549SJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 212786a27549SJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 212886a27549SJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 212986a27549SJunchao Zhang 213086a27549SJunchao Zhang /* Fill fields of the factor matrix B */ 21319566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 213286a27549SJunchao Zhang b = (Mat_SeqAIJ *)B->data; 213386a27549SJunchao Zhang b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 213486a27549SJunchao Zhang B->info.fill_ratio_given = info->fill; 2135a1e4e190SStefano Zampini B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 213686a27549SJunchao Zhang 2137aac854edSJunchao Zhang B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0; 21383ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2139930e68a5SMark Adams } 2140930e68a5SMark Adams 2141aac854edSJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2142aac854edSJunchao Zhang { 2143aac854edSJunchao Zhang PetscFunctionBegin; 2144aac854edSJunchao Zhang PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2145aac854edSJunchao Zhang PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2146aac854edSJunchao Zhang PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2147aac854edSJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2148aac854edSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 2149aac854edSJunchao Zhang } 2150aac854edSJunchao Zhang 2151aac854edSJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2152aac854edSJunchao Zhang { 2153aac854edSJunchao Zhang PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2154aac854edSJunchao Zhang 2155aac854edSJunchao Zhang PetscFunctionBegin; 2156aac854edSJunchao Zhang if (!info->factoronhost) { 2157aac854edSJunchao Zhang PetscCall(ISIdentity(isrow, &row_identity)); 2158aac854edSJunchao Zhang PetscCall(ISIdentity(iscol, &col_identity)); 2159aac854edSJunchao Zhang } 2160aac854edSJunchao Zhang 2161aac854edSJunchao Zhang PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2162aac854edSJunchao Zhang PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2163aac854edSJunchao Zhang 2164aac854edSJunchao Zhang if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering 2165aac854edSJunchao Zhang PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info)); 2166aac854edSJunchao Zhang } else { 2167aac854edSJunchao Zhang PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host 2168aac854edSJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2169aac854edSJunchao Zhang } 2170aac854edSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 2171aac854edSJunchao Zhang } 2172aac854edSJunchao Zhang 2173aac854edSJunchao Zhang static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 2174aac854edSJunchao Zhang { 2175aac854edSJunchao Zhang PetscFunctionBegin; 2176aac854edSJunchao Zhang PetscCall(MatSeqAIJKokkosSyncHost(A)); 2177aac854edSJunchao Zhang PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 2178aac854edSJunchao Zhang 2179aac854edSJunchao Zhang if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 2180aac854edSJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2181aac854edSJunchao Zhang Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 2182aac854edSJunchao Zhang const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 2183aac854edSJunchao Zhang const MatScalar *Ba = b->a; 2184aac854edSJunchao Zhang PetscInt m = B->rmap->n; 2185aac854edSJunchao Zhang 2186aac854edSJunchao Zhang if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization 2187aac854edSJunchao Zhang // Allocate memory and copy the structure 2188aac854edSJunchao Zhang factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h 2189aac854edSJunchao Zhang factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]); 2190aac854edSJunchao Zhang factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]); 2191aac854edSJunchao Zhang factors->D_h = MatScalarKokkosViewHost(NoInit("D_h"), m); 2192aac854edSJunchao Zhang factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 2193aac854edSJunchao Zhang factors->D_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h); 2194aac854edSJunchao Zhang 2195aac854edSJunchao Zhang // Build jU_h from the skewed Aj 2196aac854edSJunchao Zhang PetscInt *Uj = factors->jU_h.data(); 2197aac854edSJunchao Zhang for (PetscInt i = 0; i < m; i++) { 2198aac854edSJunchao Zhang PetscInt ulen = Bi[i + 1] - Bi[i]; 2199aac854edSJunchao Zhang Uj[Bi[i]] = i; // diagonal entry 2200aac854edSJunchao Zhang PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal 2201aac854edSJunchao Zhang } 2202aac854edSJunchao Zhang 2203aac854edSJunchao Zhang // Copy iU, jU to device 2204aac854edSJunchao Zhang PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h)); 2205aac854edSJunchao Zhang PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h)); 2206aac854edSJunchao Zhang 2207aac854edSJunchao Zhang // Copy row/col permutation to device 2208aac854edSJunchao Zhang IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 2209aac854edSJunchao Zhang PetscBool row_identity; 2210aac854edSJunchao Zhang PetscCall(ISIdentity(rowperm, &row_identity)); 2211aac854edSJunchao Zhang if (!row_identity) { 2212aac854edSJunchao Zhang const PetscInt *ip; 2213aac854edSJunchao Zhang 2214aac854edSJunchao Zhang PetscCall(ISGetIndices(rowperm, &ip)); 2215aac854edSJunchao Zhang PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m)); 2216aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 2217aac854edSJunchao Zhang PetscCall(ISRestoreIndices(rowperm, &ip)); 2218aac854edSJunchao Zhang PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 2219aac854edSJunchao Zhang } 2220aac854edSJunchao Zhang 2221aac854edSJunchao Zhang // Create sptrsv handles for U and U^T 2222aac854edSJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2223aac854edSJunchao Zhang auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2224aac854edSJunchao Zhang #else 2225aac854edSJunchao Zhang auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2226aac854edSJunchao Zhang #endif 2227aac854edSJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2228aac854edSJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2229aac854edSJunchao Zhang } 2230aac854edSJunchao Zhang // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them 2231aac854edSJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos_Cholesky; 2232aac854edSJunchao Zhang B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky; 2233aac854edSJunchao Zhang 2234aac854edSJunchao Zhang // Copy the value 2235aac854edSJunchao Zhang PetscScalar *Ua = factors->aU_h.data(); 2236aac854edSJunchao Zhang PetscScalar *D = factors->D_h.data(); 2237aac854edSJunchao Zhang for (PetscInt i = 0; i < m; i++) { 2238aac854edSJunchao Zhang D[i] = Ba[Bdiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal 2239aac854edSJunchao Zhang Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U 2240aac854edSJunchao Zhang for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k]; 2241aac854edSJunchao Zhang } 2242aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2243aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h)); 2244aac854edSJunchao Zhang 2245aac854edSJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again 2246aac854edSJunchao Zhang factors->transpose_updated = PETSC_FALSE; 2247aac854edSJunchao Zhang } 2248aac854edSJunchao Zhang 2249aac854edSJunchao Zhang B->ops->matsolve = NULL; 2250aac854edSJunchao Zhang B->ops->matsolvetranspose = NULL; 2251aac854edSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 2252aac854edSJunchao Zhang } 2253aac854edSJunchao Zhang 2254aac854edSJunchao Zhang static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2255aac854edSJunchao Zhang { 2256aac854edSJunchao Zhang PetscFunctionBegin; 2257aac854edSJunchao Zhang if (info->solveonhost) { 2258aac854edSJunchao Zhang // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2259aac854edSJunchao Zhang PetscCall(MatSetType(B, MATSEQSBAIJ)); 2260aac854edSJunchao Zhang PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2261aac854edSJunchao Zhang } 2262aac854edSJunchao Zhang 2263aac854edSJunchao Zhang PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2264aac854edSJunchao Zhang 2265aac854edSJunchao Zhang if (!info->solveonhost) { 2266bfe80ac4SPierre Jolivet // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2267aac854edSJunchao Zhang PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2268aac854edSJunchao Zhang PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2269aac854edSJunchao Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2270aac854edSJunchao Zhang } 2271aac854edSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 2272aac854edSJunchao Zhang } 2273aac854edSJunchao Zhang 2274aac854edSJunchao Zhang static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2275aac854edSJunchao Zhang { 2276aac854edSJunchao Zhang PetscFunctionBegin; 2277aac854edSJunchao Zhang if (info->solveonhost) { 2278aac854edSJunchao Zhang // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2279aac854edSJunchao Zhang PetscCall(MatSetType(B, MATSEQSBAIJ)); 2280aac854edSJunchao Zhang PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2281aac854edSJunchao Zhang } 2282aac854edSJunchao Zhang 2283aac854edSJunchao Zhang PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm 2284aac854edSJunchao Zhang 2285aac854edSJunchao Zhang if (!info->solveonhost) { 2286bfe80ac4SPierre Jolivet // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2287aac854edSJunchao Zhang PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2288aac854edSJunchao Zhang PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2289aac854edSJunchao Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2290aac854edSJunchao Zhang } 2291aac854edSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 2292aac854edSJunchao Zhang } 2293aac854edSJunchao Zhang 2294aac854edSJunchao Zhang // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix 2295aac854edSJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type) 2296d71ae5a4SJacob Faibussowitsch { 2297930e68a5SMark Adams PetscFunctionBegin; 2298930e68a5SMark Adams *type = MATSOLVERKOKKOS; 22993ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2300930e68a5SMark Adams } 2301930e68a5SMark Adams 2302930e68a5SMark Adams /*MC 230386a27549SJunchao Zhang MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 230411a5261eSBarry Smith on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 2305930e68a5SMark Adams 2306930e68a5SMark Adams Level: beginner 2307930e68a5SMark Adams 23081cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 2309930e68a5SMark Adams M*/ 231086a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 2311930e68a5SMark Adams { 2312930e68a5SMark Adams PetscInt n = A->rmap->n; 2313aac854edSJunchao Zhang MPI_Comm comm; 2314930e68a5SMark Adams 2315930e68a5SMark Adams PetscFunctionBegin; 2316aac854edSJunchao Zhang PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 2317aac854edSJunchao Zhang PetscCall(MatCreate(comm, B)); 23189566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*B, n, n, n, n)); 2319aac854edSJunchao Zhang PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2320930e68a5SMark Adams (*B)->factortype = ftype; 23219566063dSJacob Faibussowitsch PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 23229566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 2323aac854edSJunchao Zhang PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2324aac854edSJunchao Zhang 2325aac854edSJunchao Zhang if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2326aac854edSJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 2327aac854edSJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 2328aac854edSJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2329aac854edSJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2330aac854edSJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2331aac854edSJunchao Zhang } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2332aac854edSJunchao Zhang (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKokkos; 2333aac854edSJunchao Zhang (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos; 2334aac854edSJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2335aac854edSJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2336aac854edSJunchao Zhang } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 2337aac854edSJunchao Zhang 2338aac854edSJunchao Zhang // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host 2339aac854edSJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 2340aac854edSJunchao Zhang PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos)); 23413ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2342930e68a5SMark Adams } 23438f7e8f9dSMark Adams 2344aac854edSJunchao Zhang PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void) 2345d71ae5a4SJacob Faibussowitsch { 234686a27549SJunchao Zhang PetscFunctionBegin; 23479566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2348aac854edSJunchao Zhang PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos)); 23499566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2350aac854edSJunchao Zhang PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos)); 23513ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 235286a27549SJunchao Zhang } 235386a27549SJunchao Zhang 2354076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */ 2355d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 2356d71ae5a4SJacob Faibussowitsch { 235745402d8aSJunchao Zhang const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map); 235845402d8aSJunchao Zhang const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries); 235945402d8aSJunchao Zhang const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values); 2360076ba34aSJunchao Zhang const PetscInt *i = iv.data(); 2361076ba34aSJunchao Zhang const PetscInt *j = jv.data(); 2362076ba34aSJunchao Zhang const PetscScalar *a = av.data(); 2363076ba34aSJunchao Zhang PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 2364076ba34aSJunchao Zhang 2365076ba34aSJunchao Zhang PetscFunctionBegin; 23669566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 2367076ba34aSJunchao Zhang for (PetscInt k = 0; k < m; k++) { 23689566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 236948a46eb9SPierre Jolivet for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 23709566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 2371076ba34aSJunchao Zhang } 23723ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2373076ba34aSJunchao Zhang } 2374