1e36ced11SJunchao Zhang #include <petsc_kokkos.hpp> 211d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 3c0c276a7Ssdargavi #include <petscmat_kokkos.hpp> 4076ba34aSJunchao Zhang #include <petscpkg_version.h> 5152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 642550becSJunchao Zhang #include <petsc/private/sfimpl.h> 7aac854edSJunchao Zhang #include <petsc/private/kokkosimpl.hpp> 87233ce55SJed Brown #include <petscsys.h> 98c3ff71bSJunchao Zhang 108c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 11f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 128c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 13cc6e31f1SJunchao Zhang 14cc6e31f1SJunchao Zhang // To suppress compiler warnings: 15cc6e31f1SJunchao Zhang // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63: 16cc6e31f1SJunchao Zhang // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, 17cc6e31f1SJunchao Zhang // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*, 18cc6e31f1SJunchao Zhang // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations] 192695cf96SNuno Nobre #define DISABLE_CUSPARSE_DEPRECATED 208c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 21cc6e31f1SJunchao Zhang 2286a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp> 2386a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp> 24076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp> 25076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp> 269d13fa56SJunchao Zhang #include <KokkosBatched_LU_Decl.hpp> 279d13fa56SJunchao Zhang #include <KokkosBatched_InverseLU_Decl.hpp> 2886a27549SJunchao Zhang 2942550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 308c3ff71bSJunchao Zhang 310e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 32f98996d3SJunchao Zhang #include <KokkosSparse_Utils.hpp> 33f98996d3SJunchao Zhang using KokkosSparse::sort_crs_matrix; 349371c9d4SSatish Balay using KokkosSparse::Impl::transpose_matrix; 35f98996d3SJunchao Zhang #else 36f98996d3SJunchao Zhang #include <KokkosKernels_Sorting.hpp> 37f98996d3SJunchao Zhang using KokkosKernels::sort_crs_matrix; 389371c9d4SSatish Balay using KokkosKernels::Impl::transpose_matrix; 39f98996d3SJunchao Zhang #endif 40f98996d3SJunchao Zhang 41aac854edSJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0) 42aac854edSJunchao Zhang using KokkosSparse::spiluk_symbolic; 43aac854edSJunchao Zhang using KokkosSparse::spiluk_numeric; 44aac854edSJunchao Zhang using KokkosSparse::sptrsv_symbolic; 45aac854edSJunchao Zhang using KokkosSparse::sptrsv_solve; 46aac854edSJunchao Zhang using KokkosSparse::Experimental::SPTRSVAlgorithm; 47aac854edSJunchao Zhang using KokkosSparse::Experimental::SPILUKAlgorithm; 48aac854edSJunchao Zhang #else 49aac854edSJunchao Zhang using KokkosSparse::Experimental::spiluk_symbolic; 50aac854edSJunchao Zhang using KokkosSparse::Experimental::spiluk_numeric; 51aac854edSJunchao Zhang using KokkosSparse::Experimental::sptrsv_symbolic; 52aac854edSJunchao Zhang using KokkosSparse::Experimental::sptrsv_solve; 53aac854edSJunchao Zhang using KokkosSparse::Experimental::SPTRSVAlgorithm; 54aac854edSJunchao Zhang using KokkosSparse::Experimental::SPILUKAlgorithm; 55aac854edSJunchao Zhang #endif 56aac854edSJunchao Zhang 578c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 588c3ff71bSJunchao Zhang 59076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 60076ba34aSJunchao Zhang we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 61076ba34aSJunchao Zhang In the latter case, it is important to set a_dual's sync state correctly. 62076ba34aSJunchao Zhang */ 63d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 64d71ae5a4SJacob Faibussowitsch { 65076ba34aSJunchao Zhang Mat_SeqAIJ *aijseq; 66076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 678c3ff71bSJunchao Zhang 688c3ff71bSJunchao Zhang PetscFunctionBegin; 693ba16761SJacob Faibussowitsch if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 709566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 71076ba34aSJunchao Zhang 72076ba34aSJunchao Zhang aijseq = static_cast<Mat_SeqAIJ *>(A->data); 73076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 74076ba34aSJunchao Zhang 75076ba34aSJunchao Zhang /* If aijkok does not exist, we just copy i, j to device. 76076ba34aSJunchao Zhang If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 77076ba34aSJunchao Zhang In both cases, we build a new aijkok structure. 78076ba34aSJunchao Zhang */ 79076ba34aSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 8093a54799SPierre Jolivet if (aijkok && aijkok->host_aij_allocated_by_kokkos) { /* Avoid accidentally freeing much needed a,i,j on host when deleting aijkok */ 81d1c799ffSJunchao Zhang PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i)); 82d1c799ffSJunchao Zhang PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j)); 83d1c799ffSJunchao Zhang PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a)); 84d1c799ffSJunchao Zhang PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1)); 85d1c799ffSJunchao Zhang PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz())); 86d1c799ffSJunchao Zhang PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz())); 87d1c799ffSJunchao Zhang aijseq->free_a = PETSC_TRUE; 88d1c799ffSJunchao Zhang aijseq->free_ij = PETSC_TRUE; 89d1c799ffSJunchao Zhang /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where 90d1c799ffSJunchao Zhang we have the CsrMatrix on device first and then copy to host, followed by 91d1c799ffSJunchao Zhang MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL. 92d1c799ffSJunchao Zhang One could improve it by not using NULL garray. 93d1c799ffSJunchao Zhang */ 94d1c799ffSJunchao Zhang } 95076ba34aSJunchao Zhang delete aijkok; 96f4747e26SJunchao Zhang aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 97076ba34aSJunchao Zhang A->spptr = aijkok; 98f4747e26SJunchao Zhang } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 99f4747e26SJunchao Zhang MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 100f4747e26SJunchao Zhang auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 101f4747e26SJunchao Zhang aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 102076ba34aSJunchao Zhang } 1033ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1048c3ff71bSJunchao Zhang } 1058c3ff71bSJunchao Zhang 10686a27549SJunchao Zhang /* Sync CSR data to device if not yet */ 107d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 108d71ae5a4SJacob Faibussowitsch { 1098c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1108c3ff71bSJunchao Zhang 1118c3ff71bSJunchao Zhang PetscFunctionBegin; 112aaa8cc7dSPierre Jolivet PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 1135f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 114076ba34aSJunchao Zhang if (aijkok->a_dual.need_sync_device()) { 115f3d3cd90SZach Atkins PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 116580c7c76SPierre Jolivet aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 11786a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 1188c3ff71bSJunchao Zhang } 1193ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1208c3ff71bSJunchao Zhang } 1218c3ff71bSJunchao Zhang 122076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */ 123d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 124d71ae5a4SJacob Faibussowitsch { 12586a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 12686a27549SJunchao Zhang 12786a27549SJunchao Zhang PetscFunctionBegin; 1285f80ce2aSJacob Faibussowitsch PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 12986a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 13086a27549SJunchao Zhang aijkok->a_dual.modify_device(); 13186a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 13286a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 1339566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 1343ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 13586a27549SJunchao Zhang } 13686a27549SJunchao Zhang 137d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 138d71ae5a4SJacob Faibussowitsch { 139f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1404df4a32cSJunchao Zhang auto exec = PetscGetKokkosExecutionSpace(); 141f0cf5187SStefano Zampini 142f0cf5187SStefano Zampini PetscFunctionBegin; 143f0cf5187SStefano Zampini PetscCheckTypeName(A, MATSEQAIJKOKKOS); 14486a27549SJunchao Zhang /* We do not expect one needs factors on host */ 145aaa8cc7dSPierre Jolivet PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 1465f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 147f3d3cd90SZach Atkins PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec)); 1483ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 149f0cf5187SStefano Zampini } 150f0cf5187SStefano Zampini 151d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 152d71ae5a4SJacob Faibussowitsch { 153076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 154f0cf5187SStefano Zampini 155f0cf5187SStefano Zampini PetscFunctionBegin; 1565519a089SJose E. Roman /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 1575519a089SJose E. Roman Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 1585519a089SJose E. Roman reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 1595519a089SJose E. Roman must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 1605519a089SJose E. Roman */ 1615519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 162f3d3cd90SZach Atkins PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 163076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 164076ba34aSJunchao Zhang } else { /* Happens when calling MatSetValues on a newly created matrix */ 165076ba34aSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 166076ba34aSJunchao Zhang } 1673ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 168076ba34aSJunchao Zhang } 169076ba34aSJunchao Zhang 170d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 171d71ae5a4SJacob Faibussowitsch { 172076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 173076ba34aSJunchao Zhang 174076ba34aSJunchao Zhang PetscFunctionBegin; 1755519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 1763ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 177076ba34aSJunchao Zhang } 178076ba34aSJunchao Zhang 179d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 180d71ae5a4SJacob Faibussowitsch { 181076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 182076ba34aSJunchao Zhang 183076ba34aSJunchao Zhang PetscFunctionBegin; 1845519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 185f3d3cd90SZach Atkins PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 186076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1872328674fSJunchao Zhang } else { 1882328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 1892328674fSJunchao Zhang } 1903ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 191076ba34aSJunchao Zhang } 192076ba34aSJunchao Zhang 193d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 194d71ae5a4SJacob Faibussowitsch { 195076ba34aSJunchao Zhang PetscFunctionBegin; 196076ba34aSJunchao Zhang *array = NULL; 1973ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 198076ba34aSJunchao Zhang } 199076ba34aSJunchao Zhang 200d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 201d71ae5a4SJacob Faibussowitsch { 202076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 203076ba34aSJunchao Zhang 204076ba34aSJunchao Zhang PetscFunctionBegin; 2055519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 206076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 2072328674fSJunchao Zhang } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 2082328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 2092328674fSJunchao Zhang } 2103ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 211076ba34aSJunchao Zhang } 212076ba34aSJunchao Zhang 213d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 214d71ae5a4SJacob Faibussowitsch { 215076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 216076ba34aSJunchao Zhang 217076ba34aSJunchao Zhang PetscFunctionBegin; 2185519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 219076ba34aSJunchao Zhang aijkok->a_dual.clear_sync_state(); 220076ba34aSJunchao Zhang aijkok->a_dual.modify_host(); 2212328674fSJunchao Zhang } 2223ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 223f0cf5187SStefano Zampini } 224f0cf5187SStefano Zampini 225d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 226d71ae5a4SJacob Faibussowitsch { 2277ee59b9bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 2287ee59b9bSJunchao Zhang 2297ee59b9bSJunchao Zhang PetscFunctionBegin; 2307ee59b9bSJunchao Zhang PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 2317ee59b9bSJunchao Zhang 2327ee59b9bSJunchao Zhang if (i) *i = aijkok->i_device_data(); 2337ee59b9bSJunchao Zhang if (j) *j = aijkok->j_device_data(); 2347ee59b9bSJunchao Zhang if (a) { 235f3d3cd90SZach Atkins PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace())); 2367ee59b9bSJunchao Zhang *a = aijkok->a_device_data(); 2377ee59b9bSJunchao Zhang } 2387ee59b9bSJunchao Zhang if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 2393ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2407ee59b9bSJunchao Zhang } 2417ee59b9bSJunchao Zhang 24203db1824SAlex Lindsay static PetscErrorCode MatGetCurrentMemType_SeqAIJKokkos(PETSC_UNUSED Mat A, PetscMemType *mtype) 24303db1824SAlex Lindsay { 24403db1824SAlex Lindsay PetscFunctionBegin; 24503db1824SAlex Lindsay *mtype = PETSC_MEMTYPE_KOKKOS; 24603db1824SAlex Lindsay PetscFunctionReturn(PETSC_SUCCESS); 24703db1824SAlex Lindsay } 24803db1824SAlex Lindsay 2490e3ece09SJunchao Zhang /* 2500e3ece09SJunchao Zhang Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 2510e3ece09SJunchao Zhang 2520e3ece09SJunchao Zhang Input Parameter: 2530e3ece09SJunchao Zhang . A - the MATSEQAIJKOKKOS matrix 2540e3ece09SJunchao Zhang 2550e3ece09SJunchao Zhang Output Parameters: 2560e3ece09SJunchao Zhang + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 257aaa8cc7dSPierre Jolivet - T_d - the transpose on device, whose value array is allocated but not initialized 2580e3ece09SJunchao Zhang */ 2590e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 260d71ae5a4SJacob Faibussowitsch { 2610e3ece09SJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 2620e3ece09SJunchao Zhang PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 2630e3ece09SJunchao Zhang const PetscInt *Ai = aseq->i, *Aj = aseq->j; 2647b8d4ba6SJunchao Zhang MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 2650e3ece09SJunchao Zhang MatRowMapType *Ti = Ti_h.data(); 2667b8d4ba6SJunchao Zhang MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 2677b8d4ba6SJunchao Zhang MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 2680e3ece09SJunchao Zhang PetscInt *Tj = Tj_h.data(); 2690e3ece09SJunchao Zhang PetscInt *perm = perm_h.data(); 2700e3ece09SJunchao Zhang PetscInt *offset; 271152b3e56SJunchao Zhang 272152b3e56SJunchao Zhang PetscFunctionBegin; 2730e3ece09SJunchao Zhang // Populate Ti 2740e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 2750e3ece09SJunchao Zhang Ti++; 2760e3ece09SJunchao Zhang for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 2770e3ece09SJunchao Zhang Ti--; 2780e3ece09SJunchao Zhang for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 2790e3ece09SJunchao Zhang 2800e3ece09SJunchao Zhang // Populate Tj and the permutation array 2810e3ece09SJunchao Zhang PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 2820e3ece09SJunchao Zhang for (PetscInt i = 0; i < m; i++) { 2830e3ece09SJunchao Zhang for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 2840e3ece09SJunchao Zhang PetscInt r = Aj[j]; // row r of T 2850e3ece09SJunchao Zhang PetscInt disp = Ti[r] + offset[r]; 2860e3ece09SJunchao Zhang 2870e3ece09SJunchao Zhang Tj[disp] = i; // col i of T 2880e3ece09SJunchao Zhang perm[disp] = j; 2890e3ece09SJunchao Zhang offset[r]++; 290076ba34aSJunchao Zhang } 2910e3ece09SJunchao Zhang } 2920e3ece09SJunchao Zhang PetscCall(PetscFree(offset)); 2930e3ece09SJunchao Zhang 2940e3ece09SJunchao Zhang // Sort each row of T, along with the permutation array 2950e3ece09SJunchao Zhang for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 2960e3ece09SJunchao Zhang 2970e3ece09SJunchao Zhang // Output perm and T on device 2980e3ece09SJunchao Zhang auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 2990e3ece09SJunchao Zhang auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 3000e3ece09SJunchao Zhang PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 3010e3ece09SJunchao Zhang PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 3023ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 303152b3e56SJunchao Zhang } 304152b3e56SJunchao Zhang 3050e3ece09SJunchao Zhang // Generate the transpose on device and cache it internally 3060e3ece09SJunchao Zhang // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 3070e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 308d71ae5a4SJacob Faibussowitsch { 3090e3ece09SJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 3100e3ece09SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 3110e3ece09SJunchao Zhang PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 3120e3ece09SJunchao Zhang KokkosCsrMatrix &T = akok->csrmatT; 313152b3e56SJunchao Zhang 314152b3e56SJunchao Zhang PetscFunctionBegin; 3150e3ece09SJunchao Zhang PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 316f3d3cd90SZach Atkins PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 3170e3ece09SJunchao Zhang 3180e3ece09SJunchao Zhang const auto &Aa = akok->a_dual.view_device(); 3190e3ece09SJunchao Zhang 3200e3ece09SJunchao Zhang if (A->symmetric == PETSC_BOOL3_TRUE) { 3210e3ece09SJunchao Zhang *csrmatT = akok->csrmat; 3220e3ece09SJunchao Zhang } else { 3230e3ece09SJunchao Zhang // See if we already have a cached transpose and its value is up to date 3240e3ece09SJunchao Zhang if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 3250e3ece09SJunchao Zhang if (!akok->transpose_updated) { // if the value is out of date, update the cached version 3260e3ece09SJunchao Zhang const auto &perm = akok->transpose_perm; // get the permutation array 3270e3ece09SJunchao Zhang auto &Ta = T.values; 3280e3ece09SJunchao Zhang 329d326c3f1SJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 330076ba34aSJunchao Zhang } 3310e3ece09SJunchao Zhang } else { // Generate T of size n x m for the first time 3320e3ece09SJunchao Zhang MatRowMapKokkosView perm; 3330e3ece09SJunchao Zhang 3340e3ece09SJunchao Zhang PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 3350e3ece09SJunchao Zhang akok->transpose_perm = perm; // cache the perm in this matrix for reuse 336d326c3f1SJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 3370e3ece09SJunchao Zhang } 3380e3ece09SJunchao Zhang akok->transpose_updated = PETSC_TRUE; 3390e3ece09SJunchao Zhang *csrmatT = akok->csrmatT; 3400e3ece09SJunchao Zhang } 3410e3ece09SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 3420e3ece09SJunchao Zhang } 3430e3ece09SJunchao Zhang 3440e3ece09SJunchao Zhang // Generate the Hermitian on device and cache it internally 3450e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 3460e3ece09SJunchao Zhang { 3470e3ece09SJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 3480e3ece09SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 3490e3ece09SJunchao Zhang PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 3500e3ece09SJunchao Zhang KokkosCsrMatrix &T = akok->csrmatH; 3510e3ece09SJunchao Zhang 3520e3ece09SJunchao Zhang PetscFunctionBegin; 3530e3ece09SJunchao Zhang PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 354f3d3cd90SZach Atkins PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device 3550e3ece09SJunchao Zhang 3560e3ece09SJunchao Zhang const auto &Aa = akok->a_dual.view_device(); 3570e3ece09SJunchao Zhang 3580e3ece09SJunchao Zhang if (A->hermitian == PETSC_BOOL3_TRUE) { 3590e3ece09SJunchao Zhang *csrmatH = akok->csrmat; 3600e3ece09SJunchao Zhang } else { 3610e3ece09SJunchao Zhang // See if we already have a cached hermitian and its value is up to date 3620e3ece09SJunchao Zhang if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 3630e3ece09SJunchao Zhang if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 3640e3ece09SJunchao Zhang const auto &perm = akok->transpose_perm; // get the permutation array 3650e3ece09SJunchao Zhang auto &Ta = T.values; 3660e3ece09SJunchao Zhang 367d326c3f1SJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 3680e3ece09SJunchao Zhang } 3690e3ece09SJunchao Zhang } else { // Generate T of size n x m for the first time 3700e3ece09SJunchao Zhang MatRowMapKokkosView perm; 3710e3ece09SJunchao Zhang 3720e3ece09SJunchao Zhang PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 3730e3ece09SJunchao Zhang akok->transpose_perm = perm; // cache the perm in this matrix for reuse 374d326c3f1SJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 3750e3ece09SJunchao Zhang } 3760e3ece09SJunchao Zhang akok->hermitian_updated = PETSC_TRUE; 3770e3ece09SJunchao Zhang *csrmatH = akok->csrmatH; 3780e3ece09SJunchao Zhang } 3793ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 380152b3e56SJunchao Zhang } 381a587d139SMark 3828c3ff71bSJunchao Zhang /* y = A x */ 383d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 384d71ae5a4SJacob Faibussowitsch { 3858c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 386152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 387152b3e56SJunchao Zhang PetscScalarKokkosView yv; 3888c3ff71bSJunchao Zhang 3898c3ff71bSJunchao Zhang PetscFunctionBegin; 3909566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3919566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3929566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 3939566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 3948c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 395d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 3969566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 3979566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 398076ba34aSJunchao Zhang /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 3999566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 4009566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4013ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4028c3ff71bSJunchao Zhang } 4038c3ff71bSJunchao Zhang 4048c3ff71bSJunchao Zhang /* y = A^T x */ 405d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 406d71ae5a4SJacob Faibussowitsch { 4078c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 408152b3e56SJunchao Zhang const char *mode; 409152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 410152b3e56SJunchao Zhang PetscScalarKokkosView yv; 4110e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 4128c3ff71bSJunchao Zhang 4138c3ff71bSJunchao Zhang PetscFunctionBegin; 4149566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4159566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 4169566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 4179566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 418152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 4199566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 420152b3e56SJunchao Zhang mode = "N"; 421152b3e56SJunchao Zhang } else { 422076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 4230e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 424152b3e56SJunchao Zhang mode = "T"; 425152b3e56SJunchao Zhang } 426d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 4279566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 4289566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 4290e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 4309566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4313ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4328c3ff71bSJunchao Zhang } 4338c3ff71bSJunchao Zhang 4348c3ff71bSJunchao Zhang /* y = A^H x */ 435d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 436d71ae5a4SJacob Faibussowitsch { 4378c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 438152b3e56SJunchao Zhang const char *mode; 439152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 440152b3e56SJunchao Zhang PetscScalarKokkosView yv; 4410e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 4428c3ff71bSJunchao Zhang 4438c3ff71bSJunchao Zhang PetscFunctionBegin; 4449566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4459566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 4469566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 4479566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 448152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 4499566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 450152b3e56SJunchao Zhang mode = "N"; 451152b3e56SJunchao Zhang } else { 452076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 4530e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 454152b3e56SJunchao Zhang mode = "C"; 455152b3e56SJunchao Zhang } 456d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 4579566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 4589566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 4590e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 4609566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4613ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4628c3ff71bSJunchao Zhang } 4638c3ff71bSJunchao Zhang 4648c3ff71bSJunchao Zhang /* z = A x + y */ 465d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 466d71ae5a4SJacob Faibussowitsch { 4678c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 46892896123SJunchao Zhang ConstPetscScalarKokkosView xv; 469152b3e56SJunchao Zhang PetscScalarKokkosView zv; 4708c3ff71bSJunchao Zhang 4718c3ff71bSJunchao Zhang PetscFunctionBegin; 4729566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4739566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 47492896123SJunchao Zhang if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host 4759566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 47692896123SJunchao Zhang PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device 4778c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 478d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 4799566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 48092896123SJunchao Zhang PetscCall(VecRestoreKokkosView(zz, &zv)); 4819566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 4829566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4833ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4848c3ff71bSJunchao Zhang } 4858c3ff71bSJunchao Zhang 4868c3ff71bSJunchao Zhang /* z = A^T x + y */ 487d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 488d71ae5a4SJacob Faibussowitsch { 4898c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 490152b3e56SJunchao Zhang const char *mode; 49192896123SJunchao Zhang ConstPetscScalarKokkosView xv; 492152b3e56SJunchao Zhang PetscScalarKokkosView zv; 4930e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 4948c3ff71bSJunchao Zhang 4958c3ff71bSJunchao Zhang PetscFunctionBegin; 4969566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4979566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 49892896123SJunchao Zhang if (zz != yy) PetscCall(VecCopy(yy, zz)); 4999566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 50092896123SJunchao Zhang PetscCall(VecGetKokkosView(zz, &zv)); 501152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 5029566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 503152b3e56SJunchao Zhang mode = "N"; 504152b3e56SJunchao Zhang } else { 505076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 5060e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 507152b3e56SJunchao Zhang mode = "T"; 508152b3e56SJunchao Zhang } 509d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 5109566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 51192896123SJunchao Zhang PetscCall(VecRestoreKokkosView(zz, &zv)); 5120e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 5139566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 5143ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5158c3ff71bSJunchao Zhang } 5168c3ff71bSJunchao Zhang 5178c3ff71bSJunchao Zhang /* z = A^H x + y */ 518d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 519d71ae5a4SJacob Faibussowitsch { 5208c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 521152b3e56SJunchao Zhang const char *mode; 52292896123SJunchao Zhang ConstPetscScalarKokkosView xv; 523152b3e56SJunchao Zhang PetscScalarKokkosView zv; 5240e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 5258c3ff71bSJunchao Zhang 5268c3ff71bSJunchao Zhang PetscFunctionBegin; 5279566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 5289566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 52992896123SJunchao Zhang if (zz != yy) PetscCall(VecCopy(yy, zz)); 5309566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 53192896123SJunchao Zhang PetscCall(VecGetKokkosView(zz, &zv)); 532152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 5339566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 534152b3e56SJunchao Zhang mode = "N"; 535152b3e56SJunchao Zhang } else { 536076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 5370e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 538152b3e56SJunchao Zhang mode = "C"; 539152b3e56SJunchao Zhang } 540d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 5419566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 54292896123SJunchao Zhang PetscCall(VecRestoreKokkosView(zz, &zv)); 5430e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 5449566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 5453ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 546152b3e56SJunchao Zhang } 547152b3e56SJunchao Zhang 54866976f2fSJacob Faibussowitsch static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 549d71ae5a4SJacob Faibussowitsch { 550152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 551152b3e56SJunchao Zhang 552152b3e56SJunchao Zhang PetscFunctionBegin; 553152b3e56SJunchao Zhang switch (op) { 554152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 555152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 5569566063dSJacob Faibussowitsch if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 557152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 558152b3e56SJunchao Zhang break; 559d71ae5a4SJacob Faibussowitsch default: 560d71ae5a4SJacob Faibussowitsch PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 561d71ae5a4SJacob Faibussowitsch break; 562152b3e56SJunchao Zhang } 5633ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5648c3ff71bSJunchao Zhang } 5658c3ff71bSJunchao Zhang 566076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */ 567d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 568d71ae5a4SJacob Faibussowitsch { 569076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 5708c3ff71bSJunchao Zhang 5718c3ff71bSJunchao Zhang PetscFunctionBegin; 5729566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 573076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 57451ece73cSJunchao Zhang PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); 57551ece73cSJunchao Zhang PetscCall(MatSetType(*newmat, mtype)); 5768c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 5779566063dSJacob Faibussowitsch PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 578076ba34aSJunchao Zhang } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 5795f80ce2aSJacob Faibussowitsch PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 5809566063dSJacob Faibussowitsch PetscCall(PetscFree(A->defaultvectype)); 5819566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 5829566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 5839566063dSJacob Faibussowitsch PetscCall(MatSetOps_SeqAIJKokkos(A)); 584076ba34aSJunchao Zhang aseq = static_cast<Mat_SeqAIJ *>(A->data); 585394ed5ebSJunchao Zhang if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 5865f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 587f4747e26SJunchao Zhang A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 5888c3ff71bSJunchao Zhang } 589076ba34aSJunchao Zhang } 5903ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5918c3ff71bSJunchao Zhang } 5928c3ff71bSJunchao Zhang 593076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 594076ba34aSJunchao Zhang an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 595076ba34aSJunchao Zhang */ 596d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 597d71ae5a4SJacob Faibussowitsch { 598076ba34aSJunchao Zhang Mat_SeqAIJ *bseq; 599076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 600076ba34aSJunchao Zhang Mat mat; 6018c3ff71bSJunchao Zhang 6028c3ff71bSJunchao Zhang PetscFunctionBegin; 603076ba34aSJunchao Zhang /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 6049566063dSJacob Faibussowitsch PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 605076ba34aSJunchao Zhang mat = *B; 606f4747e26SJunchao Zhang if (A->assembled) { 607076ba34aSJunchao Zhang bseq = static_cast<Mat_SeqAIJ *>(mat->data); 608f4747e26SJunchao Zhang bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 609076ba34aSJunchao Zhang bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 610076ba34aSJunchao Zhang /* Now copy values to B if needed */ 611076ba34aSJunchao Zhang if (dupOption == MAT_COPY_VALUES) { 612076ba34aSJunchao Zhang if (akok->a_dual.need_sync_device()) { 613076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 614076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 615076ba34aSJunchao Zhang } else { /* If device has the latest data, we only copy data on device */ 616076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 617076ba34aSJunchao Zhang bkok->a_dual.modify_device(); 618076ba34aSJunchao Zhang } 619076ba34aSJunchao Zhang } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 620076ba34aSJunchao Zhang /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 621076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 622076ba34aSJunchao Zhang } 623076ba34aSJunchao Zhang mat->spptr = bkok; 624076ba34aSJunchao Zhang } 625076ba34aSJunchao Zhang 6269566063dSJacob Faibussowitsch PetscCall(PetscFree(mat->defaultvectype)); 6279566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 6289566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 6299566063dSJacob Faibussowitsch PetscCall(MatSetOps_SeqAIJKokkos(mat)); 6303ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6318c3ff71bSJunchao Zhang } 6328c3ff71bSJunchao Zhang 633d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 634d71ae5a4SJacob Faibussowitsch { 6350ecb592aSJunchao Zhang Mat At; 6360e3ece09SJunchao Zhang KokkosCsrMatrix internT; 6370ecb592aSJunchao Zhang Mat_SeqAIJKokkos *atkok, *bkok; 6380ecb592aSJunchao Zhang 6390ecb592aSJunchao Zhang PetscFunctionBegin; 6407fb60732SBarry Smith if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 6419566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 6420ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 643ff751488SJunchao Zhang /* Deep copy internT, as we want to isolate the internal transpose */ 6440e3ece09SJunchao Zhang PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 6459566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 6460ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) *B = At; 6479566063dSJacob Faibussowitsch else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 6480ecb592aSJunchao Zhang } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 6490ecb592aSJunchao Zhang if ((*B)->assembled) { 6500ecb592aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 6510e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 6529566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 6530ecb592aSJunchao Zhang } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 6540ecb592aSJunchao Zhang Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 6550e3ece09SJunchao Zhang MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 6560e3ece09SJunchao Zhang MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 6570e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 6580e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 6590ecb592aSJunchao Zhang } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 6600ecb592aSJunchao Zhang } 6613ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6620ecb592aSJunchao Zhang } 6630ecb592aSJunchao Zhang 664d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 665d71ae5a4SJacob Faibussowitsch { 66686a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 6678c3ff71bSJunchao Zhang 6688c3ff71bSJunchao Zhang PetscFunctionBegin; 66986a27549SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 67086a27549SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 6718c3ff71bSJunchao Zhang delete aijkok; 67286a27549SJunchao Zhang } else { 67386a27549SJunchao Zhang delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 67486a27549SJunchao Zhang } 675cbc6b225SStefano Zampini A->spptr = NULL; 6769566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 6779566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 6789566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 67957761e9aSJunchao Zhang #if defined(PETSC_HAVE_HYPRE) 68057761e9aSJunchao Zhang PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL)); 68157761e9aSJunchao Zhang #endif 6829566063dSJacob Faibussowitsch PetscCall(MatDestroy_SeqAIJ(A)); 6833ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6848c3ff71bSJunchao Zhang } 6858c3ff71bSJunchao Zhang 6863f3ba80aSJunchao Zhang /*MC 6873f3ba80aSJunchao Zhang MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 6883f3ba80aSJunchao Zhang 68915229ffcSPierre Jolivet A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 6903f3ba80aSJunchao Zhang 6912ef1f0ffSBarry Smith Options Database Key: 69211a5261eSBarry Smith . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 6933f3ba80aSJunchao Zhang 6943f3ba80aSJunchao Zhang Level: beginner 6953f3ba80aSJunchao Zhang 6961cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 6973f3ba80aSJunchao Zhang M*/ 698d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 699d71ae5a4SJacob Faibussowitsch { 70086a27549SJunchao Zhang PetscFunctionBegin; 7019566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 7029566063dSJacob Faibussowitsch PetscCall(MatCreate_SeqAIJ(A)); 7039566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 7043ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 70586a27549SJunchao Zhang } 70686a27549SJunchao Zhang 707076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 708d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 709d71ae5a4SJacob Faibussowitsch { 710076ba34aSJunchao Zhang Mat_SeqAIJ *a, *b; 711076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok, *bkok, *ckok; 712076ba34aSJunchao Zhang MatScalarKokkosView aa, ba, ca; 713076ba34aSJunchao Zhang MatRowMapKokkosView ai, bi, ci; 714076ba34aSJunchao Zhang MatColIdxKokkosView aj, bj, cj; 715076ba34aSJunchao Zhang PetscInt m, n, nnz, aN; 716a3f881fbSStefano Zampini 717a3f881fbSStefano Zampini PetscFunctionBegin; 718076ba34aSJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 719076ba34aSJunchao Zhang PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 7204f572ea9SToby Isaac PetscAssertPointer(C, 4); 721076ba34aSJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 722076ba34aSJunchao Zhang PetscCheckTypeName(B, MATSEQAIJKOKKOS); 7235f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 7245f80ce2aSJacob Faibussowitsch PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 725076ba34aSJunchao Zhang 7269566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 7279566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 728076ba34aSJunchao Zhang a = static_cast<Mat_SeqAIJ *>(A->data); 729076ba34aSJunchao Zhang b = static_cast<Mat_SeqAIJ *>(B->data); 730076ba34aSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 731076ba34aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 732076ba34aSJunchao Zhang aa = akok->a_dual.view_device(); 733076ba34aSJunchao Zhang ai = akok->i_dual.view_device(); 734076ba34aSJunchao Zhang ba = bkok->a_dual.view_device(); 735076ba34aSJunchao Zhang bi = bkok->i_dual.view_device(); 736076ba34aSJunchao Zhang m = A->rmap->n; /* M, N and nnz of C */ 737076ba34aSJunchao Zhang n = A->cmap->n + B->cmap->n; 738076ba34aSJunchao Zhang nnz = a->nz + b->nz; 739076ba34aSJunchao Zhang aN = A->cmap->n; /* N of A */ 740076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { 741076ba34aSJunchao Zhang aj = akok->j_dual.view_device(); 742076ba34aSJunchao Zhang bj = bkok->j_dual.view_device(); 743ecd797f4SJunchao Zhang auto ca = MatScalarKokkosView("a", aa.extent(0) + ba.extent(0)); 744ecd797f4SJunchao Zhang auto ci = MatRowMapKokkosView("i", ai.extent(0)); 745ecd797f4SJunchao Zhang auto cj = MatColIdxKokkosView("j", aj.extent(0) + bj.extent(0)); 746076ba34aSJunchao Zhang 747076ba34aSJunchao Zhang /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 7489371c9d4SSatish Balay Kokkos::parallel_for( 749d326c3f1SJunchao Zhang Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 750076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 751076ba34aSJunchao Zhang PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 752076ba34aSJunchao Zhang 753076ba34aSJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 754076ba34aSJunchao Zhang ci(i) = coffset; 755076ba34aSJunchao Zhang if (i == m - 1) ci(m) = ai(m) + bi(m); 756076ba34aSJunchao Zhang }); 757076ba34aSJunchao Zhang 758076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 759076ba34aSJunchao Zhang if (k < alen) { 760076ba34aSJunchao Zhang ca(coffset + k) = aa(ai(i) + k); 761076ba34aSJunchao Zhang cj(coffset + k) = aj(ai(i) + k); 762076ba34aSJunchao Zhang } else { 763076ba34aSJunchao Zhang ca(coffset + k) = ba(bi(i) + k - alen); 764076ba34aSJunchao Zhang cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 765076ba34aSJunchao Zhang } 766076ba34aSJunchao Zhang }); 767076ba34aSJunchao Zhang }); 768ecd797f4SJunchao Zhang PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci, cj, ca)); 7699566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 770076ba34aSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { 771076ba34aSJunchao Zhang PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 772076ba34aSJunchao Zhang PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 773076ba34aSJunchao Zhang ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 774076ba34aSJunchao Zhang ca = ckok->a_dual.view_device(); 775076ba34aSJunchao Zhang ci = ckok->i_dual.view_device(); 776076ba34aSJunchao Zhang 7779371c9d4SSatish Balay Kokkos::parallel_for( 778d326c3f1SJunchao Zhang Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 779076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 780076ba34aSJunchao Zhang PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 781076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 782076ba34aSJunchao Zhang if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 783076ba34aSJunchao Zhang else ca(ci(i) + k) = ba(bi(i) + k - alen); 784076ba34aSJunchao Zhang }); 785076ba34aSJunchao Zhang }); 7869566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 787076ba34aSJunchao Zhang } 7883ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 789076ba34aSJunchao Zhang } 790076ba34aSJunchao Zhang 791cc1eb50dSBarry Smith static PetscErrorCode MatProductCtxDestroy_SeqAIJKokkos(void **pdata) 792d71ae5a4SJacob Faibussowitsch { 793076ba34aSJunchao Zhang PetscFunctionBegin; 794cc1eb50dSBarry Smith delete *reinterpret_cast<MatProductCtx_SeqAIJKokkos **>(pdata); 7953ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 796a3f881fbSStefano Zampini } 797a3f881fbSStefano Zampini 798d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 799d71ae5a4SJacob Faibussowitsch { 800a3f881fbSStefano Zampini Mat_Product *product = C->product; 801a3f881fbSStefano Zampini Mat A, B; 802076ba34aSJunchao Zhang bool transA, transB; /* use bool, since KK needs this type */ 803a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok, *bkok, *ckok; 804a3f881fbSStefano Zampini Mat_SeqAIJ *c; 805cc1eb50dSBarry Smith MatProductCtx_SeqAIJKokkos *pdata; 8060e3ece09SJunchao Zhang KokkosCsrMatrix csrmatA, csrmatB; 807a3f881fbSStefano Zampini 808a3f881fbSStefano Zampini PetscFunctionBegin; 809a3f881fbSStefano Zampini MatCheckProduct(C, 1); 8105f80ce2aSJacob Faibussowitsch PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 811cc1eb50dSBarry Smith pdata = static_cast<MatProductCtx_SeqAIJKokkos *>(C->product->data); 812076ba34aSJunchao Zhang 8130e3ece09SJunchao Zhang // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 8140e3ece09SJunchao Zhang // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 8150e3ece09SJunchao Zhang // we still do numeric. 8160e3ece09SJunchao Zhang if (pdata->reusesym) { // numeric reuses results from symbolic 8170e3ece09SJunchao Zhang pdata->reusesym = PETSC_FALSE; 8183ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 819076ba34aSJunchao Zhang } 820076ba34aSJunchao Zhang 821076ba34aSJunchao Zhang switch (product->type) { 8229371c9d4SSatish Balay case MATPRODUCT_AB: 8239371c9d4SSatish Balay transA = false; 8249371c9d4SSatish Balay transB = false; 8259371c9d4SSatish Balay break; 8269371c9d4SSatish Balay case MATPRODUCT_AtB: 8279371c9d4SSatish Balay transA = true; 8289371c9d4SSatish Balay transB = false; 8299371c9d4SSatish Balay break; 8309371c9d4SSatish Balay case MATPRODUCT_ABt: 8319371c9d4SSatish Balay transA = false; 8329371c9d4SSatish Balay transB = true; 8339371c9d4SSatish Balay break; 834d71ae5a4SJacob Faibussowitsch default: 835d71ae5a4SJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 836076ba34aSJunchao Zhang } 837076ba34aSJunchao Zhang 838a3f881fbSStefano Zampini A = product->A; 839a3f881fbSStefano Zampini B = product->B; 8409566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 8419566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 842a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 843a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 844a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 845076ba34aSJunchao Zhang 8465f80ce2aSJacob Faibussowitsch PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 847076ba34aSJunchao Zhang 8480e3ece09SJunchao Zhang csrmatA = akok->csrmat; 8490e3ece09SJunchao Zhang csrmatB = bkok->csrmat; 850076ba34aSJunchao Zhang 851076ba34aSJunchao Zhang /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 852076ba34aSJunchao Zhang if (transA) { 8539566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 854076ba34aSJunchao Zhang transA = false; 855a3f881fbSStefano Zampini } 856a3f881fbSStefano Zampini 857076ba34aSJunchao Zhang if (transB) { 8589566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 859076ba34aSJunchao Zhang transB = false; 860076ba34aSJunchao Zhang } 8619566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 8620e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 8630e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 864866eb059SJunchao Zhang auto spgemmHandle = pdata->kh.get_spgemm_handle(); 865866eb059SJunchao Zhang if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 866e944a159SJunchao Zhang #endif 867866eb059SJunchao Zhang 8689566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 8699566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(C)); 870a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 871a3f881fbSStefano Zampini c = (Mat_SeqAIJ *)C->data; 8729566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 8739566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 8749566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 875a3f881fbSStefano Zampini c->reallocs = 0; 876076ba34aSJunchao Zhang C->info.mallocs = 0; 877a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 878a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 879a3f881fbSStefano Zampini C->num_ass++; 8803ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 881a3f881fbSStefano Zampini } 882a3f881fbSStefano Zampini 883d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 884d71ae5a4SJacob Faibussowitsch { 885076ba34aSJunchao Zhang Mat_Product *product = C->product; 886076ba34aSJunchao Zhang MatProductType ptype; 887076ba34aSJunchao Zhang Mat A, B; 888076ba34aSJunchao Zhang bool transA, transB; 889076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok, *bkok, *ckok; 890cc1eb50dSBarry Smith MatProductCtx_SeqAIJKokkos *pdata; 891076ba34aSJunchao Zhang MPI_Comm comm; 8920e3ece09SJunchao Zhang KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 893a3f881fbSStefano Zampini 894a3f881fbSStefano Zampini PetscFunctionBegin; 895a3f881fbSStefano Zampini MatCheckProduct(C, 1); 8969566063dSJacob Faibussowitsch PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 8975f80ce2aSJacob Faibussowitsch PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 898a3f881fbSStefano Zampini A = product->A; 899a3f881fbSStefano Zampini B = product->B; 9009566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 9019566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 902a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 903a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 9040e3ece09SJunchao Zhang csrmatA = akok->csrmat; 9050e3ece09SJunchao Zhang csrmatB = bkok->csrmat; 906076ba34aSJunchao Zhang 907a3f881fbSStefano Zampini ptype = product->type; 9080e3ece09SJunchao Zhang // Take advantage of the symmetry if true 9090e3ece09SJunchao Zhang if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 9100e3ece09SJunchao Zhang ptype = MATPRODUCT_AB; 9110e3ece09SJunchao Zhang product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 9120e3ece09SJunchao Zhang } 9130e3ece09SJunchao Zhang if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 9140e3ece09SJunchao Zhang ptype = MATPRODUCT_AB; 9150e3ece09SJunchao Zhang product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 9160e3ece09SJunchao Zhang } 9170e3ece09SJunchao Zhang 918a3f881fbSStefano Zampini switch (ptype) { 9199371c9d4SSatish Balay case MATPRODUCT_AB: 9209371c9d4SSatish Balay transA = false; 9219371c9d4SSatish Balay transB = false; 9220e6a1e94SMark Adams PetscCall(MatSetBlockSizesFromMats(C, A, B)); 9239371c9d4SSatish Balay break; 9249371c9d4SSatish Balay case MATPRODUCT_AtB: 9259371c9d4SSatish Balay transA = true; 9269371c9d4SSatish Balay transB = false; 9270e6a1e94SMark Adams if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 9280e6a1e94SMark Adams if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 9299371c9d4SSatish Balay break; 9309371c9d4SSatish Balay case MATPRODUCT_ABt: 9319371c9d4SSatish Balay transA = false; 9329371c9d4SSatish Balay transB = true; 9330e6a1e94SMark Adams if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 9340e6a1e94SMark Adams if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 9359371c9d4SSatish Balay break; 936d71ae5a4SJacob Faibussowitsch default: 937d71ae5a4SJacob Faibussowitsch SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 938a3f881fbSStefano Zampini } 939cc1eb50dSBarry Smith PetscCallCXX(product->data = pdata = new MatProductCtx_SeqAIJKokkos()); 940076ba34aSJunchao Zhang pdata->reusesym = product->api_user; 941a3f881fbSStefano Zampini 942076ba34aSJunchao Zhang /* TODO: add command line options to select spgemm algorithms */ 943866eb059SJunchao Zhang auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 944866eb059SJunchao Zhang 945866eb059SJunchao Zhang /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 946866eb059SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 947866eb059SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 948866eb059SJunchao Zhang spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 949866eb059SJunchao Zhang #endif 950866eb059SJunchao Zhang #endif 9510e3ece09SJunchao Zhang PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 952076ba34aSJunchao Zhang 9539566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 954076ba34aSJunchao Zhang /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 955076ba34aSJunchao Zhang if (transA) { 9569566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 957076ba34aSJunchao Zhang transA = false; 958076ba34aSJunchao Zhang } 959076ba34aSJunchao Zhang 960076ba34aSJunchao Zhang if (transB) { 9619566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 962076ba34aSJunchao Zhang transB = false; 963076ba34aSJunchao Zhang } 964076ba34aSJunchao Zhang 9650e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 966076ba34aSJunchao Zhang /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 967076ba34aSJunchao Zhang So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 968076ba34aSJunchao Zhang calling new Mat_SeqAIJKokkos(). 969076ba34aSJunchao Zhang TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 970076ba34aSJunchao Zhang */ 9710e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 9720e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 973866eb059SJunchao Zhang /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 974866eb059SJunchao Zhang auto spgemmHandle = pdata->kh.get_spgemm_handle(); 975866eb059SJunchao Zhang if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 976e944a159SJunchao Zhang #endif 9779566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 978076ba34aSJunchao Zhang 9799566063dSJacob Faibussowitsch PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 9809566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 981cc1eb50dSBarry Smith C->product->destroy = MatProductCtxDestroy_SeqAIJKokkos; 9823ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 983a3f881fbSStefano Zampini } 984a3f881fbSStefano Zampini 985a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 986d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 987d71ae5a4SJacob Faibussowitsch { 988076ba34aSJunchao Zhang Mat_Product *product = mat->product; 989a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 990a3f881fbSStefano Zampini 991a3f881fbSStefano Zampini PetscFunctionBegin; 992a3f881fbSStefano Zampini MatCheckProduct(mat, 1); 9939566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 99448a46eb9SPierre Jolivet if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 995a3f881fbSStefano Zampini if (Biskok && Ciskok) { 996a3f881fbSStefano Zampini switch (product->type) { 997a3f881fbSStefano Zampini case MATPRODUCT_AB: 998a3f881fbSStefano Zampini case MATPRODUCT_AtB: 999d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABt: 1000d71ae5a4SJacob Faibussowitsch mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 1001d71ae5a4SJacob Faibussowitsch break; 1002a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 1003a3f881fbSStefano Zampini case MATPRODUCT_RARt: 1004d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABC: 1005d71ae5a4SJacob Faibussowitsch mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 1006d71ae5a4SJacob Faibussowitsch break; 1007d71ae5a4SJacob Faibussowitsch default: 1008d71ae5a4SJacob Faibussowitsch break; 1009a3f881fbSStefano Zampini } 1010a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 10119566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 1012a3f881fbSStefano Zampini } 10133ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1014a3f881fbSStefano Zampini } 1015a587d139SMark 1016d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 1017d71ae5a4SJacob Faibussowitsch { 1018f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 1019f0cf5187SStefano Zampini 1020f0cf5187SStefano Zampini PetscFunctionBegin; 10219566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 10229566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1023f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1024d326c3f1SJunchao Zhang KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 10259566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 10269566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 10279566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 10283ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1029f0cf5187SStefano Zampini } 1030f0cf5187SStefano Zampini 1031f4747e26SJunchao Zhang // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 1032f4747e26SJunchao Zhang static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 1033f4747e26SJunchao Zhang { 1034f4747e26SJunchao Zhang Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1035f4747e26SJunchao Zhang 1036f4747e26SJunchao Zhang PetscFunctionBegin; 1037*07425a8dSBarry Smith if (A->assembled && aijseq->diagDense) { // no missing diagonals 1038f4747e26SJunchao Zhang PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 1039f4747e26SJunchao Zhang 1040f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1041f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1042f4747e26SJunchao Zhang const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1043f4747e26SJunchao Zhang const auto &Aa = aijkok->a_dual.view_device(); 1044f4747e26SJunchao Zhang const auto &Adiag = aijkok->diag_dual.view_device(); 1045d326c3f1SJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1046f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1047f4747e26SJunchao Zhang PetscCall(PetscLogGpuFlops(n)); 1048f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1049f4747e26SJunchao Zhang } else { // need reassembly, very slow! 1050f4747e26SJunchao Zhang PetscCall(MatShift_Basic(A, a)); 1051f4747e26SJunchao Zhang } 1052f4747e26SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 1053f4747e26SJunchao Zhang } 1054f4747e26SJunchao Zhang 1055f4747e26SJunchao Zhang static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1056f4747e26SJunchao Zhang { 1057f4747e26SJunchao Zhang Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1058f4747e26SJunchao Zhang 1059f4747e26SJunchao Zhang PetscFunctionBegin; 1060*07425a8dSBarry Smith if (Y->assembled && aijseq->diagDense) { // no missing diagonals 1061f4747e26SJunchao Zhang ConstPetscScalarKokkosView dv; 1062f4747e26SJunchao Zhang PetscInt n, nv; 1063f4747e26SJunchao Zhang 1064f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1065f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1066f4747e26SJunchao Zhang PetscCall(VecGetKokkosView(D, &dv)); 1067f4747e26SJunchao Zhang PetscCall(VecGetLocalSize(D, &nv)); 1068f4747e26SJunchao Zhang n = PetscMin(Y->rmap->n, Y->cmap->n); 1069f4747e26SJunchao Zhang PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1070f4747e26SJunchao Zhang 1071f4747e26SJunchao Zhang const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1072f4747e26SJunchao Zhang const auto &Aa = aijkok->a_dual.view_device(); 1073f4747e26SJunchao Zhang const auto &Adiag = aijkok->diag_dual.view_device(); 1074f4747e26SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( 1075d326c3f1SJunchao Zhang Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1076f4747e26SJunchao Zhang if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1077f4747e26SJunchao Zhang else Aa(Adiag(i)) += dv(i); 1078f4747e26SJunchao Zhang })); 1079f4747e26SJunchao Zhang PetscCall(VecRestoreKokkosView(D, &dv)); 1080f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1081f4747e26SJunchao Zhang PetscCall(PetscLogGpuFlops(n)); 1082f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1083f4747e26SJunchao Zhang } else { // need reassembly, very slow! 1084f4747e26SJunchao Zhang PetscCall(MatDiagonalSet_Default(Y, D, is)); 1085f4747e26SJunchao Zhang } 1086f4747e26SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 1087f4747e26SJunchao Zhang } 1088f4747e26SJunchao Zhang 1089f4747e26SJunchao Zhang static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1090f4747e26SJunchao Zhang { 1091f4747e26SJunchao Zhang Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1092f4747e26SJunchao Zhang PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1093f4747e26SJunchao Zhang ConstPetscScalarKokkosView lv, rv; 1094f4747e26SJunchao Zhang 1095f4747e26SJunchao Zhang PetscFunctionBegin; 1096f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1097f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1098f4747e26SJunchao Zhang const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1099f4747e26SJunchao Zhang const auto &Aa = aijkok->a_dual.view_device(); 1100f4747e26SJunchao Zhang const auto &Ai = aijkok->i_dual.view_device(); 1101f4747e26SJunchao Zhang const auto &Aj = aijkok->j_dual.view_device(); 1102f4747e26SJunchao Zhang if (ll) { 1103f4747e26SJunchao Zhang PetscCall(VecGetLocalSize(ll, &m)); 1104f4747e26SJunchao Zhang PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1105f4747e26SJunchao Zhang PetscCall(VecGetKokkosView(ll, &lv)); 1106f4747e26SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( // for each row 1107d326c3f1SJunchao Zhang Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1108f4747e26SJunchao Zhang PetscInt i = t.league_rank(); // row i 1109f4747e26SJunchao Zhang PetscInt len = Ai(i + 1) - Ai(i); 1110f4747e26SJunchao Zhang // scale entries on the row 1111f4747e26SJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1112f4747e26SJunchao Zhang })); 1113f4747e26SJunchao Zhang PetscCall(VecRestoreKokkosView(ll, &lv)); 1114f4747e26SJunchao Zhang PetscCall(PetscLogGpuFlops(nz)); 1115f4747e26SJunchao Zhang } 1116f4747e26SJunchao Zhang if (rr) { 1117f4747e26SJunchao Zhang PetscCall(VecGetLocalSize(rr, &n)); 1118f4747e26SJunchao Zhang PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1119f4747e26SJunchao Zhang PetscCall(VecGetKokkosView(rr, &rv)); 1120f4747e26SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1121d326c3f1SJunchao Zhang Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1122f4747e26SJunchao Zhang PetscCall(VecRestoreKokkosView(rr, &lv)); 1123f4747e26SJunchao Zhang PetscCall(PetscLogGpuFlops(nz)); 1124f4747e26SJunchao Zhang } 1125f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1126f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1127f4747e26SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 1128f4747e26SJunchao Zhang } 1129f4747e26SJunchao Zhang 1130d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1131d71ae5a4SJacob Faibussowitsch { 1132076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1133a587d139SMark 1134a587d139SMark PetscFunctionBegin; 1135076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 11362328674fSJunchao Zhang if (aijkok) { /* Only zero the device if data is already there */ 1137d326c3f1SJunchao Zhang KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 11389566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 11392328674fSJunchao Zhang } else { /* Might be preallocated but not assembled */ 11409566063dSJacob Faibussowitsch PetscCall(MatZeroEntries_SeqAIJ(A)); 11412328674fSJunchao Zhang } 11423ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1143a587d139SMark } 1144a587d139SMark 1145d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1146d71ae5a4SJacob Faibussowitsch { 1147f78ce678SMark Adams Mat_SeqAIJKokkos *aijkok; 1148f78ce678SMark Adams PetscInt n; 1149f78ce678SMark Adams PetscScalarKokkosView xv; 1150f78ce678SMark Adams 1151f78ce678SMark Adams PetscFunctionBegin; 1152f78ce678SMark Adams PetscCall(VecGetLocalSize(x, &n)); 1153f78ce678SMark Adams PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1154f78ce678SMark Adams PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1155f78ce678SMark Adams 1156f78ce678SMark Adams PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1157f78ce678SMark Adams aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1158f78ce678SMark Adams 1159f78ce678SMark Adams const auto &Aa = aijkok->a_dual.view_device(); 1160f78ce678SMark Adams const auto &Ai = aijkok->i_dual.view_device(); 1161f78ce678SMark Adams const auto &Adiag = aijkok->diag_dual.view_device(); 1162f78ce678SMark Adams 1163f78ce678SMark Adams PetscCall(VecGetKokkosViewWrite(x, &xv)); 11649371c9d4SSatish Balay Kokkos::parallel_for( 1165d326c3f1SJunchao Zhang Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1166f78ce678SMark Adams if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1167f78ce678SMark Adams else xv(i) = 0; 1168f78ce678SMark Adams }); 1169f78ce678SMark Adams PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 11703ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1171f78ce678SMark Adams } 1172f78ce678SMark Adams 1173db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1174d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1175d71ae5a4SJacob Faibussowitsch { 1176db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1177db78de30SJunchao Zhang 1178db78de30SJunchao Zhang PetscFunctionBegin; 1179db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 11804f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1181db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 11829566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1183db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1184076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 11853ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1186db78de30SJunchao Zhang } 1187db78de30SJunchao Zhang 1188d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1189d71ae5a4SJacob Faibussowitsch { 1190db78de30SJunchao Zhang PetscFunctionBegin; 1191db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 11924f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1193db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 11943ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1195db78de30SJunchao Zhang } 1196db78de30SJunchao Zhang 1197d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1198d71ae5a4SJacob Faibussowitsch { 1199db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1200db78de30SJunchao Zhang 1201db78de30SJunchao Zhang PetscFunctionBegin; 1202db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 12034f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1204db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 12059566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1206db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1207076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 12083ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1209db78de30SJunchao Zhang } 1210db78de30SJunchao Zhang 1211d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1212d71ae5a4SJacob Faibussowitsch { 1213db78de30SJunchao Zhang PetscFunctionBegin; 1214db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 12154f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1216db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 12179566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 12183ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1219db78de30SJunchao Zhang } 1220db78de30SJunchao Zhang 1221d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1222d71ae5a4SJacob Faibussowitsch { 1223db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1224db78de30SJunchao Zhang 1225db78de30SJunchao Zhang PetscFunctionBegin; 1226db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 12274f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1228db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1229db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1230076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 12313ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1232db78de30SJunchao Zhang } 1233db78de30SJunchao Zhang 1234d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1235d71ae5a4SJacob Faibussowitsch { 1236db78de30SJunchao Zhang PetscFunctionBegin; 1237db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 12384f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1239db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 12409566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 12413ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1242db78de30SJunchao Zhang } 1243db78de30SJunchao Zhang 1244c0c276a7Ssdargavi PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A) 1245c0c276a7Ssdargavi { 1246c0c276a7Ssdargavi Mat_SeqAIJKokkos *akok; 1247c0c276a7Ssdargavi 1248c0c276a7Ssdargavi PetscFunctionBegin; 1249ecd797f4SJunchao Zhang PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_d.extent(0), i_d, j_d, a_d)); 1250c0c276a7Ssdargavi PetscCall(MatCreate(comm, A)); 1251c0c276a7Ssdargavi PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1252c0c276a7Ssdargavi PetscFunctionReturn(PETSC_SUCCESS); 1253c0c276a7Ssdargavi } 1254c0c276a7Ssdargavi 1255c17cf699SJunchao Zhang /* Computes Y += alpha X */ 1256d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1257d71ae5a4SJacob Faibussowitsch { 1258a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1259c17cf699SJunchao Zhang Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1260c17cf699SJunchao Zhang ConstMatScalarKokkosView Xa; 1261c17cf699SJunchao Zhang MatScalarKokkosView Ya; 12624df4a32cSJunchao Zhang auto exec = PetscGetKokkosExecutionSpace(); 1263a587d139SMark 1264a587d139SMark PetscFunctionBegin; 1265c17cf699SJunchao Zhang PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1266c17cf699SJunchao Zhang PetscCheckTypeName(X, MATSEQAIJKOKKOS); 12679566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 12689566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(X)); 12699566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1270db78de30SJunchao Zhang 1271c17cf699SJunchao Zhang if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1272a587d139SMark PetscBool e; 12739566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1274a587d139SMark if (e) { 12759566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1276c17cf699SJunchao Zhang if (e) pattern = SAME_NONZERO_PATTERN; 1277a587d139SMark } 1278a587d139SMark } 1279db78de30SJunchao Zhang 1280c17cf699SJunchao Zhang /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1281c17cf699SJunchao Zhang cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1282c17cf699SJunchao Zhang If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1283c17cf699SJunchao Zhang KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1284c17cf699SJunchao Zhang */ 1285c17cf699SJunchao Zhang ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1286c17cf699SJunchao Zhang xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1287c17cf699SJunchao Zhang Xa = xkok->a_dual.view_device(); 1288c17cf699SJunchao Zhang Ya = ykok->a_dual.view_device(); 1289c17cf699SJunchao Zhang 1290c17cf699SJunchao Zhang if (pattern == SAME_NONZERO_PATTERN) { 1291d326c3f1SJunchao Zhang KokkosBlas::axpy(exec, alpha, Xa, Ya); 12929566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1293c17cf699SJunchao Zhang } else if (pattern == SUBSET_NONZERO_PATTERN) { 1294c17cf699SJunchao Zhang MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1295c17cf699SJunchao Zhang MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1296c17cf699SJunchao Zhang 12979371c9d4SSatish Balay Kokkos::parallel_for( 1298d326c3f1SJunchao Zhang Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 12990e3ece09SJunchao Zhang PetscInt i = t.league_rank(); // row i 13000e3ece09SJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { 13010e3ece09SJunchao Zhang // Only one thread works in a team 1302c17cf699SJunchao Zhang PetscInt p, q = Yi(i); 13030e3ece09SJunchao Zhang for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 13040e3ece09SJunchao Zhang while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 13050e3ece09SJunchao Zhang if (Xj(p) == Yj(q)) { // Found it 1306c17cf699SJunchao Zhang Ya(q) += alpha * Xa(p); 1307c17cf699SJunchao Zhang q++; 1308a587d139SMark } else { 13090e3ece09SJunchao Zhang // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 13100e3ece09SJunchao Zhang // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 13110e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 13120e3ece09SJunchao Zhang if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 13138b8b16f9SJunchao Zhang #else 13140e3ece09SJunchao Zhang if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 13158b8b16f9SJunchao Zhang #endif 1316a587d139SMark } 1317c17cf699SJunchao Zhang } 1318c17cf699SJunchao Zhang }); 1319c17cf699SJunchao Zhang }); 13209566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 13210e3ece09SJunchao Zhang } else { // different nonzero patterns 1322c17cf699SJunchao Zhang Mat Z; 1323c17cf699SJunchao Zhang KokkosCsrMatrix zcsr; 1324c17cf699SJunchao Zhang KernelHandle kh; 13250e3ece09SJunchao Zhang kh.create_spadd_handle(true); // X, Y are sorted 1326c17cf699SJunchao Zhang KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1327c17cf699SJunchao Zhang KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1328c17cf699SJunchao Zhang zkok = new Mat_SeqAIJKokkos(zcsr); 13299566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 13309566063dSJacob Faibussowitsch PetscCall(MatHeaderReplace(Y, &Z)); 1331c17cf699SJunchao Zhang kh.destroy_spadd_handle(); 1332c17cf699SJunchao Zhang } 13339566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 13340e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 13353ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1336a587d139SMark } 1337a587d139SMark 13382c4ab24aSJunchao Zhang struct MatCOOStruct_SeqAIJKokkos { 13392c4ab24aSJunchao Zhang PetscCount n; 13402c4ab24aSJunchao Zhang PetscCount Atot; 13412c4ab24aSJunchao Zhang PetscInt nz; 13422c4ab24aSJunchao Zhang PetscCountKokkosView jmap; 13432c4ab24aSJunchao Zhang PetscCountKokkosView perm; 13442c4ab24aSJunchao Zhang 13452c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 13462c4ab24aSJunchao Zhang { 13472c4ab24aSJunchao Zhang nz = coo_h->nz; 13482c4ab24aSJunchao Zhang n = coo_h->n; 13492c4ab24aSJunchao Zhang Atot = coo_h->Atot; 13502c4ab24aSJunchao Zhang jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 13512c4ab24aSJunchao Zhang perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 13522c4ab24aSJunchao Zhang } 13532c4ab24aSJunchao Zhang }; 13542c4ab24aSJunchao Zhang 135549abdd8aSBarry Smith static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data) 13562c4ab24aSJunchao Zhang { 13572c4ab24aSJunchao Zhang PetscFunctionBegin; 135849abdd8aSBarry Smith PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data)); 13592c4ab24aSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 13602c4ab24aSJunchao Zhang } 13612c4ab24aSJunchao Zhang 1362d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1363d71ae5a4SJacob Faibussowitsch { 136442550becSJunchao Zhang Mat_SeqAIJKokkos *akok; 136542550becSJunchao Zhang Mat_SeqAIJ *aseq; 136603e76207SPierre Jolivet PetscContainer container_h; 13672c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJ *coo_h; 13682c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJKokkos *coo_d; 136942550becSJunchao Zhang 137042550becSJunchao Zhang PetscFunctionBegin; 13719566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1372394ed5ebSJunchao Zhang aseq = static_cast<Mat_SeqAIJ *>(mat->data); 137342550becSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1374cbc6b225SStefano Zampini delete akok; 1375f4747e26SJunchao Zhang mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 13769566063dSJacob Faibussowitsch PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 13772c4ab24aSJunchao Zhang 13782c4ab24aSJunchao Zhang // Copy the COO struct to device 13792c4ab24aSJunchao Zhang PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 13802c4ab24aSJunchao Zhang PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 13812c4ab24aSJunchao Zhang PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 13822c4ab24aSJunchao Zhang 13832c4ab24aSJunchao Zhang // Put the COO struct in a container and then attach that to the matrix 138403e76207SPierre Jolivet PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos)); 13853ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 138642550becSJunchao Zhang } 138742550becSJunchao Zhang 1388d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1389d71ae5a4SJacob Faibussowitsch { 139042550becSJunchao Zhang MatScalarKokkosView Aa; 139142550becSJunchao Zhang ConstMatScalarKokkosView kv; 139242550becSJunchao Zhang PetscMemType memtype; 13932c4ab24aSJunchao Zhang PetscContainer container; 13942c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJKokkos *coo; 139542550becSJunchao Zhang 139642550becSJunchao Zhang PetscFunctionBegin; 13972c4ab24aSJunchao Zhang PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 13982c4ab24aSJunchao Zhang PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 13992c4ab24aSJunchao Zhang 14002c4ab24aSJunchao Zhang const auto &n = coo->n; 14012c4ab24aSJunchao Zhang const auto &Annz = coo->nz; 14022c4ab24aSJunchao Zhang const auto &jmap = coo->jmap; 14032c4ab24aSJunchao Zhang const auto &perm = coo->perm; 14042c4ab24aSJunchao Zhang 14059566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(v, &memtype)); 140642550becSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 14072c4ab24aSJunchao Zhang kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 140842550becSJunchao Zhang } else { 14092c4ab24aSJunchao Zhang kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 141042550becSJunchao Zhang } 141142550becSJunchao Zhang 1412c7b718f4SJunchao Zhang if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1413c7b718f4SJunchao Zhang else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 141442550becSJunchao Zhang 141508bb9926SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 14169371c9d4SSatish Balay Kokkos::parallel_for( 1417d326c3f1SJunchao Zhang Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1418c7b718f4SJunchao Zhang PetscScalar sum = 0.0; 1419c7b718f4SJunchao Zhang for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1420c7b718f4SJunchao Zhang Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1421c7b718f4SJunchao Zhang }); 142208bb9926SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1423394ed5ebSJunchao Zhang 14249566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 14259566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 14263ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 142742550becSJunchao Zhang } 142842550becSJunchao Zhang 1429d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1430d71ae5a4SJacob Faibussowitsch { 1431076ba34aSJunchao Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1432076ba34aSJunchao Zhang 14338c3ff71bSJunchao Zhang PetscFunctionBegin; 1434076ba34aSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 14356f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 14366f3d89d0SStefano Zampini 14378c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 14388c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 14398c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1440a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 1441f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 1442a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1443076ba34aSJunchao Zhang A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 14448c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 14458c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 14468c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 14478c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 14488c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 14498c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1450076ba34aSJunchao Zhang A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 14510ecb592aSJunchao Zhang A->ops->transpose = MatTranspose_SeqAIJKokkos; 1452152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 1453f78ce678SMark Adams A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1454f4747e26SJunchao Zhang A->ops->shift = MatShift_SeqAIJKokkos; 1455f4747e26SJunchao Zhang A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1456f4747e26SJunchao Zhang A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 145703db1824SAlex Lindsay A->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJKokkos; 1458076ba34aSJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1459076ba34aSJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1460076ba34aSJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1461076ba34aSJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1462076ba34aSJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1463076ba34aSJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 14647ee59b9bSJunchao Zhang a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 146542550becSJunchao Zhang 14669566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 14679566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 146857761e9aSJunchao Zhang #if defined(PETSC_HAVE_HYPRE) 146957761e9aSJunchao Zhang PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE)); 147057761e9aSJunchao Zhang #endif 14713ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1472076ba34aSJunchao Zhang } 1473076ba34aSJunchao Zhang 14749d13fa56SJunchao Zhang /* 14759d13fa56SJunchao Zhang Extract the (prescribled) diagonal blocks of the matrix and then invert them 14769d13fa56SJunchao Zhang 14779d13fa56SJunchao Zhang Input Parameters: 14789d13fa56SJunchao Zhang + A - the MATSEQAIJKOKKOS matrix 14799d13fa56SJunchao Zhang . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 14809d13fa56SJunchao Zhang . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 14819d13fa56SJunchao Zhang . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 14829d13fa56SJunchao Zhang - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 14839d13fa56SJunchao Zhang 14849d13fa56SJunchao Zhang Output Parameter: 14859d13fa56SJunchao Zhang . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 14869d13fa56SJunchao Zhang */ 14879d13fa56SJunchao Zhang PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 14889d13fa56SJunchao Zhang { 14899d13fa56SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 14909d13fa56SJunchao Zhang PetscInt nblocks = bs.extent(0) - 1; 14919d13fa56SJunchao Zhang 14929d13fa56SJunchao Zhang PetscFunctionBegin; 14939d13fa56SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 14949d13fa56SJunchao Zhang 14959d13fa56SJunchao Zhang // Pull out the diagonal blocks of the matrix and then invert the blocks 14969d13fa56SJunchao Zhang auto Aa = akok->a_dual.view_device(); 14979d13fa56SJunchao Zhang auto Ai = akok->i_dual.view_device(); 14989d13fa56SJunchao Zhang auto Aj = akok->j_dual.view_device(); 14999d13fa56SJunchao Zhang auto Adiag = akok->diag_dual.view_device(); 15009d13fa56SJunchao Zhang // TODO: how to tune the team size? 150145402d8aSJunchao Zhang #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY) 15029d13fa56SJunchao Zhang auto ts = Kokkos::AUTO(); 15039d13fa56SJunchao Zhang #else 15049d13fa56SJunchao Zhang auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 15059d13fa56SJunchao Zhang #endif 15069d13fa56SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( 1507d326c3f1SJunchao Zhang Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 15089d13fa56SJunchao Zhang const PetscInt bid = teamMember.league_rank(); // block id 15099d13fa56SJunchao Zhang const PetscInt rstart = bs(bid); // this block starts from this row 15109d13fa56SJunchao Zhang const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 15119d13fa56SJunchao Zhang const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 15129d13fa56SJunchao Zhang const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 15139d13fa56SJunchao Zhang 15149d13fa56SJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 15159d13fa56SJunchao Zhang PetscInt i = rstart + r; // i-th row in A 15169d13fa56SJunchao Zhang 15179d13fa56SJunchao Zhang if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 15189d13fa56SJunchao Zhang PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 15199d13fa56SJunchao Zhang 15209d13fa56SJunchao Zhang for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 15219d13fa56SJunchao Zhang if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 15229d13fa56SJunchao Zhang B(r, c) = 0.0; 15239d13fa56SJunchao Zhang } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 15249d13fa56SJunchao Zhang B(r, c) = Aa(first + c); 15259d13fa56SJunchao Zhang } else { // this entry does not show up in the CSR 15269d13fa56SJunchao Zhang B(r, c) = 0.0; 15279d13fa56SJunchao Zhang } 15289d13fa56SJunchao Zhang } 15299d13fa56SJunchao Zhang } else { // rare case that the diagonal does not exist 15309d13fa56SJunchao Zhang const PetscInt begin = Ai(i); 15319d13fa56SJunchao Zhang const PetscInt end = Ai(i + 1); 15329d13fa56SJunchao Zhang for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 15339d13fa56SJunchao Zhang for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 15349d13fa56SJunchao Zhang if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 15359d13fa56SJunchao Zhang else if (Aj(j) >= rstart + m) break; 15369d13fa56SJunchao Zhang } 15379d13fa56SJunchao Zhang } 15389d13fa56SJunchao Zhang }); 15399d13fa56SJunchao Zhang 15409d13fa56SJunchao Zhang // LU-decompose B (w/o pivoting) and then invert B 15419d13fa56SJunchao Zhang KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 15429d13fa56SJunchao Zhang KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 15439d13fa56SJunchao Zhang })); 15449d13fa56SJunchao Zhang // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 15459d13fa56SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 15469d13fa56SJunchao Zhang } 15479d13fa56SJunchao Zhang 1548d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1549d71ae5a4SJacob Faibussowitsch { 1550076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 1551076ba34aSJunchao Zhang PetscInt i, m, n; 15524df4a32cSJunchao Zhang auto exec = PetscGetKokkosExecutionSpace(); 1553076ba34aSJunchao Zhang 1554076ba34aSJunchao Zhang PetscFunctionBegin; 15555f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1556076ba34aSJunchao Zhang 1557076ba34aSJunchao Zhang m = akok->nrows(); 1558076ba34aSJunchao Zhang n = akok->ncols(); 15599566063dSJacob Faibussowitsch PetscCall(MatSetSizes(A, m, n, m, n)); 15609566063dSJacob Faibussowitsch PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1561076ba34aSJunchao Zhang 1562076ba34aSJunchao Zhang /* Set up data structures of A as a MATSEQAIJ */ 15639566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 156457508eceSPierre Jolivet aseq = (Mat_SeqAIJ *)A->data; 1565076ba34aSJunchao Zhang 1566f3d3cd90SZach Atkins PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */ 1567f3d3cd90SZach Atkins PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec)); 1568076ba34aSJunchao Zhang 1569076ba34aSJunchao Zhang aseq->i = akok->i_host_data(); 1570076ba34aSJunchao Zhang aseq->j = akok->j_host_data(); 1571076ba34aSJunchao Zhang aseq->a = akok->a_host_data(); 1572076ba34aSJunchao Zhang aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1573076ba34aSJunchao Zhang aseq->free_a = PETSC_FALSE; 1574076ba34aSJunchao Zhang aseq->free_ij = PETSC_FALSE; 1575076ba34aSJunchao Zhang aseq->nz = akok->nnz(); 1576076ba34aSJunchao Zhang aseq->maxnz = aseq->nz; 1577076ba34aSJunchao Zhang 15789566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aseq->imax)); 15799566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aseq->ilen)); 1580ad540459SPierre Jolivet for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1581076ba34aSJunchao Zhang 1582076ba34aSJunchao Zhang /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1583076ba34aSJunchao Zhang akok->nonzerostate = A->nonzerostate; 1584ff751488SJunchao Zhang A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 15859566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 15869566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 15873ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1588076ba34aSJunchao Zhang } 1589076ba34aSJunchao Zhang 15900e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 15910e3ece09SJunchao Zhang { 15920e3ece09SJunchao Zhang PetscFunctionBegin; 15930e3ece09SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(A)); 15940e3ece09SJunchao Zhang *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 15950e3ece09SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 15960e3ece09SJunchao Zhang } 15970e3ece09SJunchao Zhang 15980e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 15990e3ece09SJunchao Zhang { 16000e3ece09SJunchao Zhang Mat_SeqAIJKokkos *akok; 16014d86920dSPierre Jolivet 16020e3ece09SJunchao Zhang PetscFunctionBegin; 16030e3ece09SJunchao Zhang PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 16040e3ece09SJunchao Zhang PetscCall(MatCreate(comm, A)); 16050e3ece09SJunchao Zhang PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 16060e3ece09SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 16070e3ece09SJunchao Zhang } 16080e3ece09SJunchao Zhang 1609076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1610076ba34aSJunchao Zhang 1611076ba34aSJunchao Zhang Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1612076ba34aSJunchao Zhang */ 1613d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1614d71ae5a4SJacob Faibussowitsch { 1615076ba34aSJunchao Zhang PetscFunctionBegin; 16169566063dSJacob Faibussowitsch PetscCall(MatCreate(comm, A)); 16179566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 16183ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 16198c3ff71bSJunchao Zhang } 16208c3ff71bSJunchao Zhang 1621152b3e56SJunchao Zhang /*@C 162211a5261eSBarry Smith MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 16238c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 162420f4b53cSBarry Smith Kokkos for calculations. 16258c3ff71bSJunchao Zhang 16268c3ff71bSJunchao Zhang Collective 16278c3ff71bSJunchao Zhang 16288c3ff71bSJunchao Zhang Input Parameters: 162911a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF` 16308c3ff71bSJunchao Zhang . m - number of rows 16318c3ff71bSJunchao Zhang . n - number of columns 163220f4b53cSBarry Smith . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 163320f4b53cSBarry Smith - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 16348c3ff71bSJunchao Zhang 16358c3ff71bSJunchao Zhang Output Parameter: 16368c3ff71bSJunchao Zhang . A - the matrix 16378c3ff71bSJunchao Zhang 16382ef1f0ffSBarry Smith Level: intermediate 16392ef1f0ffSBarry Smith 16402ef1f0ffSBarry Smith Notes: 164111a5261eSBarry Smith It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 16428c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 164311a5261eSBarry Smith [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 16448c3ff71bSJunchao Zhang 164511a5261eSBarry Smith The AIJ format, also called 16462ef1f0ffSBarry Smith compressed row storage, is fully compatible with standard Fortran 16478c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 164820f4b53cSBarry Smith either one (as in Fortran) or zero. 16498c3ff71bSJunchao Zhang 16502ef1f0ffSBarry Smith Specify the preallocated storage with either `nz` or `nnz` (not both). 16512ef1f0ffSBarry Smith Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 16522ef1f0ffSBarry Smith allocation. 16538c3ff71bSJunchao Zhang 1654fe59aa6dSJacob Faibussowitsch .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 16558c3ff71bSJunchao Zhang @*/ 1656d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1657d71ae5a4SJacob Faibussowitsch { 16588c3ff71bSJunchao Zhang PetscFunctionBegin; 16599566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 16609566063dSJacob Faibussowitsch PetscCall(MatCreate(comm, A)); 16619566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A, m, n, m, n)); 16629566063dSJacob Faibussowitsch PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 16639566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 16643ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 16658c3ff71bSJunchao Zhang } 1666930e68a5SMark Adams 1667aac854edSJunchao Zhang // After matrix numeric factorization, there are still steps to do before triangular solve can be called. 1668aac854edSJunchao Zhang // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does). 1669aac854edSJunchao Zhang // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve(). 1670aac854edSJunchao Zhang // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck. 1671aac854edSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A) 1672d71ae5a4SJacob Faibussowitsch { 167386a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1674aac854edSJunchao Zhang const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1675aac854edSJunchao Zhang const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy 167686a27549SJunchao Zhang 167786a27549SJunchao Zhang PetscFunctionBegin; 1678aac854edSJunchao Zhang if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet 1679aac854edSJunchao Zhang if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d)); 1680aac854edSJunchao Zhang if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d)); 168186a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_TRUE; 168286a27549SJunchao Zhang } 16833ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 168486a27549SJunchao Zhang } 168586a27549SJunchao Zhang 1686d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1687d71ae5a4SJacob Faibussowitsch { 1688aac854edSJunchao Zhang const PetscInt n = A->rmap->n; 168986a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1690aac854edSJunchao Zhang const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy 1691aac854edSJunchao Zhang const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy 169286a27549SJunchao Zhang 169386a27549SJunchao Zhang PetscFunctionBegin; 1694aac854edSJunchao Zhang if (!factors->transpose_updated) { 1695aac854edSJunchao Zhang if (has_upper) { 1696aac854edSJunchao Zhang if (!factors->iUt_d.extent(0)) { // Allocate Ut on device if not yet 1697aac854edSJunchao Zhang factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 16987b8d4ba6SJunchao Zhang factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 16997b8d4ba6SJunchao Zhang factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 1700aac854edSJunchao Zhang } 170186a27549SJunchao Zhang 1702aac854edSJunchao Zhang if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host 1703aac854edSJunchao Zhang if (!factors->U) { 1704aac854edSJunchao Zhang Mat_SeqAIJ *seq; 170586a27549SJunchao Zhang 1706aac854edSJunchao Zhang PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U)); 1707aac854edSJunchao Zhang PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut)); 170886a27549SJunchao Zhang 1709aac854edSJunchao Zhang seq = static_cast<Mat_SeqAIJ *>(factors->Ut->data); 1710aac854edSJunchao Zhang factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1711aac854edSJunchao Zhang factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1712aac854edSJunchao Zhang factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1713aac854edSJunchao Zhang } else { 1714aac854edSJunchao Zhang PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h 1715aac854edSJunchao Zhang } 1716aac854edSJunchao Zhang // Copy Ut from host to device 1717aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h)); 1718aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h)); 1719aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h)); 1720aac854edSJunchao Zhang } else { // If U was computed on device, we also compute the transpose there 1721aac854edSJunchao Zhang // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1722aac854edSJunchao Zhang PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, 1723aac854edSJunchao Zhang factors->jU_d, factors->aU_d, 1724aac854edSJunchao Zhang factors->iUt_d, factors->jUt_d, 1725aac854edSJunchao Zhang factors->aUt_d)); 1726aac854edSJunchao Zhang PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1727aac854edSJunchao Zhang } 1728aac854edSJunchao Zhang PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d)); 1729aac854edSJunchao Zhang } 1730aac854edSJunchao Zhang 1731aac854edSJunchao Zhang // do the same for L with LU 1732aac854edSJunchao Zhang if (has_lower) { 1733aac854edSJunchao Zhang if (!factors->iLt_d.extent(0)) { // Allocate Lt on device if not yet 1734aac854edSJunchao Zhang factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix 1735aac854edSJunchao Zhang factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1736aac854edSJunchao Zhang factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 1737aac854edSJunchao Zhang } 1738aac854edSJunchao Zhang 1739aac854edSJunchao Zhang if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host 1740aac854edSJunchao Zhang if (!factors->L) { 1741aac854edSJunchao Zhang Mat_SeqAIJ *seq; 1742aac854edSJunchao Zhang 1743aac854edSJunchao Zhang PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L)); 1744aac854edSJunchao Zhang PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt)); 1745aac854edSJunchao Zhang 1746aac854edSJunchao Zhang seq = static_cast<Mat_SeqAIJ *>(factors->Lt->data); 1747aac854edSJunchao Zhang factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1); 1748aac854edSJunchao Zhang factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz); 1749aac854edSJunchao Zhang factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz); 1750aac854edSJunchao Zhang } else { 1751aac854edSJunchao Zhang PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h 1752aac854edSJunchao Zhang } 1753aac854edSJunchao Zhang // Copy Lt from host to device 1754aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h)); 1755aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h)); 1756aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h)); 1757aac854edSJunchao Zhang } else { // If L was computed on device, we also compute the transpose there 1758aac854edSJunchao Zhang // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options. 1759aac854edSJunchao Zhang PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, 1760aac854edSJunchao Zhang factors->jL_d, factors->aL_d, 1761aac854edSJunchao Zhang factors->iLt_d, factors->jLt_d, 1762aac854edSJunchao Zhang factors->aLt_d)); 1763aac854edSJunchao Zhang PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1764aac854edSJunchao Zhang } 1765aac854edSJunchao Zhang PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d)); 1766aac854edSJunchao Zhang } 1767aac854edSJunchao Zhang 176886a27549SJunchao Zhang factors->transpose_updated = PETSC_TRUE; 176986a27549SJunchao Zhang } 17703ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 177186a27549SJunchao Zhang } 177286a27549SJunchao Zhang 1773aac854edSJunchao Zhang // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A. 1774aac854edSJunchao Zhang // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty. 1775aac854edSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx) 1776d71ae5a4SJacob Faibussowitsch { 1777aac854edSJunchao Zhang auto exec = PetscGetKokkosExecutionSpace(); 177886a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1779aac854edSJunchao Zhang PetscInt m = A->rmap->n; 1780aac854edSJunchao Zhang PetscScalarKokkosView D = factors->D_d; 1781aac854edSJunchao Zhang PetscScalarKokkosView X, Y, B; // alias 1782aac854edSJunchao Zhang ConstPetscScalarKokkosView b; 1783aac854edSJunchao Zhang PetscScalarKokkosView x; 1784aac854edSJunchao Zhang PetscIntKokkosView &rowperm = factors->rowperm; 1785aac854edSJunchao Zhang PetscBool identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 178686a27549SJunchao Zhang 178786a27549SJunchao Zhang PetscFunctionBegin; 17889566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1789aac854edSJunchao Zhang PetscCall(MatSeqAIJKokkosSolveCheck(A)); // for UX = T 1790aac854edSJunchao Zhang PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B 1791aac854edSJunchao Zhang PetscCall(VecGetKokkosView(bb, &b)); 1792aac854edSJunchao Zhang PetscCall(VecGetKokkosViewWrite(xx, &x)); 1793aac854edSJunchao Zhang 1794aac854edSJunchao Zhang // Solve U^T Y = B 1795aac854edSJunchao Zhang if (identity) { // Reorder b with the row permutation 1796aac854edSJunchao Zhang B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1797aac854edSJunchao Zhang Y = factors->workVector; 1798aac854edSJunchao Zhang } else { 1799aac854edSJunchao Zhang B = factors->workVector; 1800aac854edSJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1801aac854edSJunchao Zhang Y = x; 1802aac854edSJunchao Zhang } 1803aac854edSJunchao Zhang PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1804aac854edSJunchao Zhang 1805aac854edSJunchao Zhang // Solve diag(D) Y' = Y. 1806aac854edSJunchao Zhang // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication. 1807aac854edSJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); })); 1808aac854edSJunchao Zhang 1809aac854edSJunchao Zhang // Solve UX = Y 1810aac854edSJunchao Zhang if (identity) { 1811aac854edSJunchao Zhang X = x; 1812aac854edSJunchao Zhang } else { 1813aac854edSJunchao Zhang X = factors->workVector; // B is not needed anymore 1814aac854edSJunchao Zhang } 1815aac854edSJunchao Zhang PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1816aac854edSJunchao Zhang 1817aac854edSJunchao Zhang // Reorder X with the inverse column (row) permutation 18183a7d0413SPierre Jolivet if (!identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1819aac854edSJunchao Zhang 1820aac854edSJunchao Zhang PetscCall(VecRestoreKokkosView(bb, &b)); 1821aac854edSJunchao Zhang PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 18229566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 18233ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 182486a27549SJunchao Zhang } 182586a27549SJunchao Zhang 1826aac854edSJunchao Zhang // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1827aac854edSJunchao Zhang // R and C are represented by rowperm and colperm in factors. 1828aac854edSJunchao Zhang // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1829aac854edSJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1830d71ae5a4SJacob Faibussowitsch { 1831aac854edSJunchao Zhang auto exec = PetscGetKokkosExecutionSpace(); 183286a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1833aac854edSJunchao Zhang PetscInt m = A->rmap->n; 1834aac854edSJunchao Zhang PetscScalarKokkosView X, Y, B; // alias 1835aac854edSJunchao Zhang ConstPetscScalarKokkosView b; 1836aac854edSJunchao Zhang PetscScalarKokkosView x; 1837aac854edSJunchao Zhang PetscIntKokkosView &rowperm = factors->rowperm; 1838aac854edSJunchao Zhang PetscIntKokkosView &colperm = factors->colperm; 1839aac854edSJunchao Zhang PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1840aac854edSJunchao Zhang PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 184186a27549SJunchao Zhang 184286a27549SJunchao Zhang PetscFunctionBegin; 18439566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1844aac854edSJunchao Zhang PetscCall(MatSeqAIJKokkosSolveCheck(A)); 1845aac854edSJunchao Zhang PetscCall(VecGetKokkosView(bb, &b)); 1846aac854edSJunchao Zhang PetscCall(VecGetKokkosViewWrite(xx, &x)); 184786a27549SJunchao Zhang 1848aac854edSJunchao Zhang // Solve L Y = B (i.e., L (U C^- x) = R b). R b indicates applying the row permutation on b. 1849aac854edSJunchao Zhang if (row_identity) { 1850aac854edSJunchao Zhang B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1851aac854edSJunchao Zhang Y = factors->workVector; 1852aac854edSJunchao Zhang } else { 1853aac854edSJunchao Zhang B = factors->workVector; 1854aac854edSJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); })); 1855aac854edSJunchao Zhang Y = x; 1856aac854edSJunchao Zhang } 1857aac854edSJunchao Zhang PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y)); 1858aac854edSJunchao Zhang 1859aac854edSJunchao Zhang // Solve U C^- x = Y 1860aac854edSJunchao Zhang if (col_identity) { 1861aac854edSJunchao Zhang X = x; 1862aac854edSJunchao Zhang } else { 1863aac854edSJunchao Zhang X = factors->workVector; 1864aac854edSJunchao Zhang } 1865aac854edSJunchao Zhang PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X)); 1866aac854edSJunchao Zhang 1867aac854edSJunchao Zhang // x = C X; Reorder X with the inverse col permutation 18683a7d0413SPierre Jolivet if (!col_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); })); 1869aac854edSJunchao Zhang 1870aac854edSJunchao Zhang PetscCall(VecRestoreKokkosView(bb, &b)); 1871aac854edSJunchao Zhang PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 18729566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 18733ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 187486a27549SJunchao Zhang } 187586a27549SJunchao Zhang 1876aac854edSJunchao Zhang // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively. 1877aac854edSJunchao Zhang // R and C are represented by rowperm and colperm in factors. 1878aac854edSJunchao Zhang // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty. 1879aac854edSJunchao Zhang // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R. 1880aac854edSJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx) 1881aac854edSJunchao Zhang { 1882aac854edSJunchao Zhang auto exec = PetscGetKokkosExecutionSpace(); 1883aac854edSJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1884aac854edSJunchao Zhang PetscInt m = A->rmap->n; 1885aac854edSJunchao Zhang PetscScalarKokkosView X, Y, B; // alias 1886aac854edSJunchao Zhang ConstPetscScalarKokkosView b; 1887aac854edSJunchao Zhang PetscScalarKokkosView x; 1888aac854edSJunchao Zhang PetscIntKokkosView &rowperm = factors->rowperm; 1889aac854edSJunchao Zhang PetscIntKokkosView &colperm = factors->colperm; 1890aac854edSJunchao Zhang PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1891aac854edSJunchao Zhang PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; 1892aac854edSJunchao Zhang 1893aac854edSJunchao Zhang PetscFunctionBegin; 1894aac854edSJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1895aac854edSJunchao Zhang PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T 1896aac854edSJunchao Zhang PetscCall(VecGetKokkosView(bb, &b)); 1897aac854edSJunchao Zhang PetscCall(VecGetKokkosViewWrite(xx, &x)); 1898aac854edSJunchao Zhang 1899aac854edSJunchao Zhang // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b). Note C^- b = C^T b, which means applying the column permutation on b. 1900aac854edSJunchao Zhang if (col_identity) { // Reorder b with the col permutation 1901aac854edSJunchao Zhang B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0)); 1902aac854edSJunchao Zhang Y = factors->workVector; 1903aac854edSJunchao Zhang } else { 1904aac854edSJunchao Zhang B = factors->workVector; 1905aac854edSJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); })); 1906aac854edSJunchao Zhang Y = x; 1907aac854edSJunchao Zhang } 1908aac854edSJunchao Zhang PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y)); 1909aac854edSJunchao Zhang 1910aac854edSJunchao Zhang // Solve L^T X = Y 1911aac854edSJunchao Zhang if (row_identity) { 1912aac854edSJunchao Zhang X = x; 1913aac854edSJunchao Zhang } else { 1914aac854edSJunchao Zhang X = factors->workVector; 1915aac854edSJunchao Zhang } 1916aac854edSJunchao Zhang PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X)); 1917aac854edSJunchao Zhang 1918aac854edSJunchao Zhang // x = R^- X = R^T X; Reorder X with the inverse row permutation 19193a7d0413SPierre Jolivet if (!row_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); })); 1920aac854edSJunchao Zhang 1921aac854edSJunchao Zhang PetscCall(VecRestoreKokkosView(bb, &b)); 1922aac854edSJunchao Zhang PetscCall(VecRestoreKokkosViewWrite(xx, &x)); 1923aac854edSJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1924aac854edSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 1925aac854edSJunchao Zhang } 1926aac854edSJunchao Zhang 1927aac854edSJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1928aac854edSJunchao Zhang { 1929aac854edSJunchao Zhang PetscFunctionBegin; 1930aac854edSJunchao Zhang PetscCall(MatSeqAIJKokkosSyncHost(A)); 1931aac854edSJunchao Zhang PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 1932aac854edSJunchao Zhang 1933aac854edSJunchao Zhang if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 1934aac854edSJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 1935aac854edSJunchao Zhang Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 1936aac854edSJunchao Zhang const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 1937aac854edSJunchao Zhang const MatScalar *Ba = b->a; 1938aac854edSJunchao Zhang PetscInt m = B->rmap->n, n = B->cmap->n; 1939aac854edSJunchao Zhang 1940aac854edSJunchao Zhang if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time 1941aac854edSJunchao Zhang // Allocate memory and copy the structure 1942aac854edSJunchao Zhang factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1); 1943aac854edSJunchao Zhang factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries 1944aac854edSJunchao Zhang factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m); 1945aac854edSJunchao Zhang factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1); 1946aac854edSJunchao Zhang factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m])); 1947aac854edSJunchao Zhang factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m])); 1948aac854edSJunchao Zhang 1949aac854edSJunchao Zhang PetscInt *Li = factors->iL_h.data(); 1950aac854edSJunchao Zhang PetscInt *Lj = factors->jL_h.data(); 1951aac854edSJunchao Zhang PetscInt *Ui = factors->iU_h.data(); 1952aac854edSJunchao Zhang PetscInt *Uj = factors->jU_h.data(); 1953aac854edSJunchao Zhang 1954aac854edSJunchao Zhang Li[0] = Ui[0] = 0; 1955aac854edSJunchao Zhang for (PetscInt i = 0; i < m; i++) { 1956aac854edSJunchao Zhang PetscInt llen = Bi[i + 1] - Bi[i]; // exclusive of the diagonal entry 1957aac854edSJunchao Zhang PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry 1958aac854edSJunchao Zhang 195964dc1d19SNuno Nobre PetscCall(PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen)); // entries of L on the left of the diagonal 1960aac854edSJunchao Zhang Lj[Li[i] + llen] = i; // diagonal entry of L 1961aac854edSJunchao Zhang 1962aac854edSJunchao Zhang Uj[Ui[i]] = i; // diagonal entry of U 196364dc1d19SNuno Nobre PetscCall(PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal 1964aac854edSJunchao Zhang 1965aac854edSJunchao Zhang Li[i + 1] = Li[i] + llen + 1; 1966aac854edSJunchao Zhang Ui[i + 1] = Ui[i] + ulen; 1967aac854edSJunchao Zhang } 1968aac854edSJunchao Zhang 1969aac854edSJunchao Zhang factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h); 1970aac854edSJunchao Zhang factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h); 1971aac854edSJunchao Zhang factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h); 1972aac854edSJunchao Zhang factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h); 1973aac854edSJunchao Zhang factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h); 1974aac854edSJunchao Zhang factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 1975aac854edSJunchao Zhang 1976aac854edSJunchao Zhang // Copy row/col permutation to device 1977aac854edSJunchao Zhang IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 1978aac854edSJunchao Zhang PetscBool row_identity; 1979aac854edSJunchao Zhang PetscCall(ISIdentity(rowperm, &row_identity)); 1980aac854edSJunchao Zhang if (!row_identity) { 1981aac854edSJunchao Zhang const PetscInt *ip; 1982aac854edSJunchao Zhang 1983aac854edSJunchao Zhang PetscCall(ISGetIndices(rowperm, &ip)); 1984aac854edSJunchao Zhang factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m); 1985aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 1986aac854edSJunchao Zhang PetscCall(ISRestoreIndices(rowperm, &ip)); 1987aac854edSJunchao Zhang PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 1988aac854edSJunchao Zhang } 1989aac854edSJunchao Zhang 1990aac854edSJunchao Zhang IS colperm = ((Mat_SeqAIJ *)B->data)->col; 1991aac854edSJunchao Zhang PetscBool col_identity; 1992aac854edSJunchao Zhang PetscCall(ISIdentity(colperm, &col_identity)); 1993aac854edSJunchao Zhang if (!col_identity) { 1994aac854edSJunchao Zhang const PetscInt *ip; 1995aac854edSJunchao Zhang 1996aac854edSJunchao Zhang PetscCall(ISGetIndices(colperm, &ip)); 1997aac854edSJunchao Zhang factors->colperm = PetscIntKokkosView(NoInit("colperm"), n); 1998aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n))); 1999aac854edSJunchao Zhang PetscCall(ISRestoreIndices(colperm, &ip)); 2000aac854edSJunchao Zhang PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt))); 2001aac854edSJunchao Zhang } 2002aac854edSJunchao Zhang 2003aac854edSJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 2004aac854edSJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2005aac854edSJunchao Zhang auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2006aac854edSJunchao Zhang #else 2007aac854edSJunchao Zhang auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2008aac854edSJunchao Zhang #endif 2009aac854edSJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */); 2010aac854edSJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2011aac854edSJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */); 2012aac854edSJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2013aac854edSJunchao Zhang } 2014aac854edSJunchao Zhang 2015aac854edSJunchao Zhang // Copy the value 2016aac854edSJunchao Zhang for (PetscInt i = 0; i < m; i++) { 2017aac854edSJunchao Zhang PetscInt llen = Bi[i + 1] - Bi[i]; 2018aac854edSJunchao Zhang PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; 2019aac854edSJunchao Zhang const PetscInt *Li = factors->iL_h.data(); 2020aac854edSJunchao Zhang const PetscInt *Ui = factors->iU_h.data(); 2021aac854edSJunchao Zhang 2022aac854edSJunchao Zhang PetscScalar *La = factors->aL_h.data(); 2023aac854edSJunchao Zhang PetscScalar *Ua = factors->aU_h.data(); 2024aac854edSJunchao Zhang 202564dc1d19SNuno Nobre PetscCall(PetscArraycpy(La + Li[i], Ba + Bi[i], llen)); // entries of L 2026aac854edSJunchao Zhang La[Li[i] + llen] = 1.0; // diagonal entry 2027aac854edSJunchao Zhang 2028aac854edSJunchao Zhang Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]; // diagonal entry 202964dc1d19SNuno Nobre PetscCall(PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1)); // entries of U 2030aac854edSJunchao Zhang } 2031aac854edSJunchao Zhang 2032aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h)); 2033aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2034aac854edSJunchao Zhang // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic 2035aac854edSJunchao Zhang factors->transpose_updated = PETSC_FALSE; 2036aac854edSJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 2037aac854edSJunchao Zhang 2038aac854edSJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2039aac854edSJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 2040aac854edSJunchao Zhang } 2041aac854edSJunchao Zhang 2042aac854edSJunchao Zhang B->ops->matsolve = NULL; 2043aac854edSJunchao Zhang B->ops->matsolvetranspose = NULL; 2044aac854edSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 2045aac854edSJunchao Zhang } 2046aac854edSJunchao Zhang 2047aac854edSJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info) 2048d71ae5a4SJacob Faibussowitsch { 204986a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 205086a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 205186a27549SJunchao Zhang PetscInt fill_lev = info->levels; 205286a27549SJunchao Zhang 205386a27549SJunchao Zhang PetscFunctionBegin; 20549566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2055aac854edSJunchao Zhang PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false"); 20569566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2057076ba34aSJunchao Zhang 2058076ba34aSJunchao Zhang auto a_d = aijkok->a_dual.view_device(); 2059076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 2060076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 2061076ba34aSJunchao Zhang 2062aac854edSJunchao Zhang PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d)); 206386a27549SJunchao Zhang 206486a27549SJunchao Zhang B->assembled = PETSC_TRUE; 206586a27549SJunchao Zhang B->preallocated = PETSC_TRUE; 2066aac854edSJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos_LU; 2067aac854edSJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU; 206886a27549SJunchao Zhang B->ops->matsolve = NULL; 206986a27549SJunchao Zhang B->ops->matsolvetranspose = NULL; 207086a27549SJunchao Zhang 207186a27549SJunchao Zhang /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 207286a27549SJunchao Zhang factors->transpose_updated = PETSC_FALSE; 207386a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 2074eeadb341SJunchao Zhang /* TODO: log flops, but how to know that? */ 20759566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 20763ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 207786a27549SJunchao Zhang } 207886a27549SJunchao Zhang 2079aac854edSJunchao Zhang // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering 2080aac854edSJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info) 2081d71ae5a4SJacob Faibussowitsch { 208286a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 208386a27549SJunchao Zhang Mat_SeqAIJ *b; 208486a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 208586a27549SJunchao Zhang PetscInt fill_lev = info->levels; 208686a27549SJunchao Zhang PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 208786a27549SJunchao Zhang PetscInt n = A->rmap->n; 208886a27549SJunchao Zhang 208986a27549SJunchao Zhang PetscFunctionBegin; 2090aac854edSJunchao Zhang PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now"); 20919566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 209286a27549SJunchao Zhang 209386a27549SJunchao Zhang /* Create a spiluk handle and then do symbolic factorization */ 209486a27549SJunchao Zhang nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 2095aac854edSJunchao Zhang factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 209686a27549SJunchao Zhang 209786a27549SJunchao Zhang auto spiluk_handle = factors->kh.get_spiluk_handle(); 209886a27549SJunchao Zhang 209986a27549SJunchao Zhang Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 210086a27549SJunchao Zhang Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 210186a27549SJunchao Zhang Kokkos::realloc(factors->iU_d, n + 1); 210286a27549SJunchao Zhang Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 210386a27549SJunchao Zhang 210486a27549SJunchao Zhang aijkok = (Mat_SeqAIJKokkos *)A->spptr; 2105076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 2106076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 2107aac854edSJunchao Zhang PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d)); 210886a27549SJunchao Zhang /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 210986a27549SJunchao Zhang 211086a27549SJunchao Zhang Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 211186a27549SJunchao Zhang Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 211286a27549SJunchao Zhang Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 211386a27549SJunchao Zhang Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 211486a27549SJunchao Zhang 211586a27549SJunchao Zhang /* TODO: add options to select sptrsv algorithms */ 211686a27549SJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 211786a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2118aac854edSJunchao Zhang auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 211986a27549SJunchao Zhang #else 2120aac854edSJunchao Zhang auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 212186a27549SJunchao Zhang #endif 212286a27549SJunchao Zhang 212386a27549SJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 212486a27549SJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 212586a27549SJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 212686a27549SJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 212786a27549SJunchao Zhang 212886a27549SJunchao Zhang /* Fill fields of the factor matrix B */ 21299566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 213086a27549SJunchao Zhang b = (Mat_SeqAIJ *)B->data; 213186a27549SJunchao Zhang b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 213286a27549SJunchao Zhang B->info.fill_ratio_given = info->fill; 2133a1e4e190SStefano Zampini B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 213486a27549SJunchao Zhang 2135aac854edSJunchao Zhang B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0; 21363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2137930e68a5SMark Adams } 2138930e68a5SMark Adams 2139aac854edSJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2140aac854edSJunchao Zhang { 2141aac854edSJunchao Zhang PetscFunctionBegin; 2142aac854edSJunchao Zhang PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 2143aac854edSJunchao Zhang PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2144aac854edSJunchao Zhang PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2145aac854edSJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2146aac854edSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 2147aac854edSJunchao Zhang } 2148aac854edSJunchao Zhang 2149aac854edSJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 2150aac854edSJunchao Zhang { 2151aac854edSJunchao Zhang PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE; 2152aac854edSJunchao Zhang 2153aac854edSJunchao Zhang PetscFunctionBegin; 2154aac854edSJunchao Zhang if (!info->factoronhost) { 2155aac854edSJunchao Zhang PetscCall(ISIdentity(isrow, &row_identity)); 2156aac854edSJunchao Zhang PetscCall(ISIdentity(iscol, &col_identity)); 2157aac854edSJunchao Zhang } 2158aac854edSJunchao Zhang 2159aac854edSJunchao Zhang PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2160aac854edSJunchao Zhang PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2161aac854edSJunchao Zhang 2162aac854edSJunchao Zhang if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering 2163aac854edSJunchao Zhang PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info)); 2164aac854edSJunchao Zhang } else { 2165aac854edSJunchao Zhang PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host 2166aac854edSJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 2167aac854edSJunchao Zhang } 2168aac854edSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 2169aac854edSJunchao Zhang } 2170aac854edSJunchao Zhang 2171aac854edSJunchao Zhang static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 2172aac854edSJunchao Zhang { 2173aac854edSJunchao Zhang PetscFunctionBegin; 2174aac854edSJunchao Zhang PetscCall(MatSeqAIJKokkosSyncHost(A)); 2175aac854edSJunchao Zhang PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info)); 2176aac854edSJunchao Zhang 2177aac854edSJunchao Zhang if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device 2178aac854edSJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 2179aac854edSJunchao Zhang Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data); 2180aac854edSJunchao Zhang const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag; 2181aac854edSJunchao Zhang const MatScalar *Ba = b->a; 2182aac854edSJunchao Zhang PetscInt m = B->rmap->n; 2183aac854edSJunchao Zhang 2184aac854edSJunchao Zhang if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization 2185aac854edSJunchao Zhang // Allocate memory and copy the structure 2186aac854edSJunchao Zhang factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h 2187aac854edSJunchao Zhang factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]); 2188aac854edSJunchao Zhang factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]); 2189aac854edSJunchao Zhang factors->D_h = MatScalarKokkosViewHost(NoInit("D_h"), m); 2190aac854edSJunchao Zhang factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h); 2191aac854edSJunchao Zhang factors->D_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h); 2192aac854edSJunchao Zhang 2193aac854edSJunchao Zhang // Build jU_h from the skewed Aj 2194aac854edSJunchao Zhang PetscInt *Uj = factors->jU_h.data(); 2195aac854edSJunchao Zhang for (PetscInt i = 0; i < m; i++) { 2196aac854edSJunchao Zhang PetscInt ulen = Bi[i + 1] - Bi[i]; 2197aac854edSJunchao Zhang Uj[Bi[i]] = i; // diagonal entry 2198aac854edSJunchao Zhang PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal 2199aac854edSJunchao Zhang } 2200aac854edSJunchao Zhang 2201aac854edSJunchao Zhang // Copy iU, jU to device 2202aac854edSJunchao Zhang PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h)); 2203aac854edSJunchao Zhang PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h)); 2204aac854edSJunchao Zhang 2205aac854edSJunchao Zhang // Copy row/col permutation to device 2206aac854edSJunchao Zhang IS rowperm = ((Mat_SeqAIJ *)B->data)->row; 2207aac854edSJunchao Zhang PetscBool row_identity; 2208aac854edSJunchao Zhang PetscCall(ISIdentity(rowperm, &row_identity)); 2209aac854edSJunchao Zhang if (!row_identity) { 2210aac854edSJunchao Zhang const PetscInt *ip; 2211aac854edSJunchao Zhang 2212aac854edSJunchao Zhang PetscCall(ISGetIndices(rowperm, &ip)); 2213aac854edSJunchao Zhang PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m)); 2214aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m))); 2215aac854edSJunchao Zhang PetscCall(ISRestoreIndices(rowperm, &ip)); 2216aac854edSJunchao Zhang PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt))); 2217aac854edSJunchao Zhang } 2218aac854edSJunchao Zhang 2219aac854edSJunchao Zhang // Create sptrsv handles for U and U^T 2220aac854edSJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 2221aac854edSJunchao Zhang auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE; 2222aac854edSJunchao Zhang #else 2223aac854edSJunchao Zhang auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1; 2224aac854edSJunchao Zhang #endif 2225aac854edSJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */); 2226aac854edSJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */); 2227aac854edSJunchao Zhang } 2228aac854edSJunchao Zhang // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them 2229aac854edSJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos_Cholesky; 2230aac854edSJunchao Zhang B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky; 2231aac854edSJunchao Zhang 2232aac854edSJunchao Zhang // Copy the value 2233aac854edSJunchao Zhang PetscScalar *Ua = factors->aU_h.data(); 2234aac854edSJunchao Zhang PetscScalar *D = factors->D_h.data(); 2235aac854edSJunchao Zhang for (PetscInt i = 0; i < m; i++) { 2236aac854edSJunchao Zhang D[i] = Ba[Bdiag[i]]; // actually Aa[Adiag[i]] is the inverse of the diagonal 2237aac854edSJunchao Zhang Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U 2238aac854edSJunchao Zhang for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k]; 2239aac854edSJunchao Zhang } 2240aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h)); 2241aac854edSJunchao Zhang PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h)); 2242aac854edSJunchao Zhang 2243aac854edSJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again 2244aac854edSJunchao Zhang factors->transpose_updated = PETSC_FALSE; 2245aac854edSJunchao Zhang } 2246aac854edSJunchao Zhang 2247aac854edSJunchao Zhang B->ops->matsolve = NULL; 2248aac854edSJunchao Zhang B->ops->matsolvetranspose = NULL; 2249aac854edSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 2250aac854edSJunchao Zhang } 2251aac854edSJunchao Zhang 2252aac854edSJunchao Zhang static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2253aac854edSJunchao Zhang { 2254aac854edSJunchao Zhang PetscFunctionBegin; 2255aac854edSJunchao Zhang if (info->solveonhost) { 2256aac854edSJunchao Zhang // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2257aac854edSJunchao Zhang PetscCall(MatSetType(B, MATSEQSBAIJ)); 2258aac854edSJunchao Zhang PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2259aac854edSJunchao Zhang } 2260aac854edSJunchao Zhang 2261aac854edSJunchao Zhang PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info)); 2262aac854edSJunchao Zhang 2263aac854edSJunchao Zhang if (!info->solveonhost) { 2264bfe80ac4SPierre Jolivet // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2265aac854edSJunchao Zhang PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2266aac854edSJunchao Zhang PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2267aac854edSJunchao Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2268aac854edSJunchao Zhang } 2269aac854edSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 2270aac854edSJunchao Zhang } 2271aac854edSJunchao Zhang 2272aac854edSJunchao Zhang static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2273aac854edSJunchao Zhang { 2274aac854edSJunchao Zhang PetscFunctionBegin; 2275aac854edSJunchao Zhang if (info->solveonhost) { 2276aac854edSJunchao Zhang // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc. 2277aac854edSJunchao Zhang PetscCall(MatSetType(B, MATSEQSBAIJ)); 2278aac854edSJunchao Zhang PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 2279aac854edSJunchao Zhang } 2280aac854edSJunchao Zhang 2281aac854edSJunchao Zhang PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm 2282aac854edSJunchao Zhang 2283aac854edSJunchao Zhang if (!info->solveonhost) { 2284bfe80ac4SPierre Jolivet // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr 2285aac854edSJunchao Zhang PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2286aac854edSJunchao Zhang PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n)); 2287aac854edSJunchao Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos; 2288aac854edSJunchao Zhang } 2289aac854edSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 2290aac854edSJunchao Zhang } 2291aac854edSJunchao Zhang 2292aac854edSJunchao Zhang // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix 2293aac854edSJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type) 2294d71ae5a4SJacob Faibussowitsch { 2295930e68a5SMark Adams PetscFunctionBegin; 2296930e68a5SMark Adams *type = MATSOLVERKOKKOS; 22973ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2298930e68a5SMark Adams } 2299930e68a5SMark Adams 2300930e68a5SMark Adams /*MC 230186a27549SJunchao Zhang MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 230211a5261eSBarry Smith on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 2303930e68a5SMark Adams 2304930e68a5SMark Adams Level: beginner 2305930e68a5SMark Adams 2306e255e071SSatish Balay .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS` 2307930e68a5SMark Adams M*/ 230886a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 2309930e68a5SMark Adams { 2310930e68a5SMark Adams PetscInt n = A->rmap->n; 2311aac854edSJunchao Zhang MPI_Comm comm; 2312930e68a5SMark Adams 2313930e68a5SMark Adams PetscFunctionBegin; 2314aac854edSJunchao Zhang PetscCall(PetscObjectGetComm((PetscObject)A, &comm)); 2315aac854edSJunchao Zhang PetscCall(MatCreate(comm, B)); 23169566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*B, n, n, n, n)); 2317aac854edSJunchao Zhang PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 2318930e68a5SMark Adams (*B)->factortype = ftype; 23199566063dSJacob Faibussowitsch PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 23209566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 2321aac854edSJunchao Zhang PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr"); 2322aac854edSJunchao Zhang 2323aac854edSJunchao Zhang if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 2324aac854edSJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 2325aac854edSJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 2326aac854edSJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 2327aac854edSJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 2328aac854edSJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 2329aac854edSJunchao Zhang } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 2330aac854edSJunchao Zhang (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKokkos; 2331aac854edSJunchao Zhang (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos; 2332aac854edSJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 2333aac854edSJunchao Zhang PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 2334aac854edSJunchao Zhang } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 2335aac854edSJunchao Zhang 2336aac854edSJunchao Zhang // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host 2337aac854edSJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 2338aac854edSJunchao Zhang PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos)); 23393ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2340930e68a5SMark Adams } 23418f7e8f9dSMark Adams 2342aac854edSJunchao Zhang PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void) 2343d71ae5a4SJacob Faibussowitsch { 234486a27549SJunchao Zhang PetscFunctionBegin; 23459566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2346aac854edSJunchao Zhang PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos)); 23479566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 2348aac854edSJunchao Zhang PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos)); 23493ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 235086a27549SJunchao Zhang } 235186a27549SJunchao Zhang 2352076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */ 2353d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 2354d71ae5a4SJacob Faibussowitsch { 235545402d8aSJunchao Zhang const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map); 235645402d8aSJunchao Zhang const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries); 235745402d8aSJunchao Zhang const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values); 2358076ba34aSJunchao Zhang const PetscInt *i = iv.data(); 2359076ba34aSJunchao Zhang const PetscInt *j = jv.data(); 2360076ba34aSJunchao Zhang const PetscScalar *a = av.data(); 2361076ba34aSJunchao Zhang PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 2362076ba34aSJunchao Zhang 2363076ba34aSJunchao Zhang PetscFunctionBegin; 23649566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 2365076ba34aSJunchao Zhang for (PetscInt k = 0; k < m; k++) { 23669566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 236748a46eb9SPierre Jolivet for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 23689566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 2369076ba34aSJunchao Zhang } 23703ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2371076ba34aSJunchao Zhang } 2372