1e36ced11SJunchao Zhang #include <petsc_kokkos.hpp> 211d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 3*c0c276a7Ssdargavi #include <petscmat_kokkos.hpp> 4076ba34aSJunchao Zhang #include <petscpkg_version.h> 5152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 642550becSJunchao Zhang #include <petsc/private/sfimpl.h> 78c3ff71bSJunchao Zhang #include <petscsystypes.h> 88c3ff71bSJunchao Zhang #include <petscerror.h> 98c3ff71bSJunchao Zhang 108c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 11f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 128c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 13cc6e31f1SJunchao Zhang 14cc6e31f1SJunchao Zhang // To suppress compiler warnings: 15cc6e31f1SJunchao Zhang // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63: 16cc6e31f1SJunchao Zhang // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, 17cc6e31f1SJunchao Zhang // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*, 18cc6e31f1SJunchao Zhang // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations] 19cc6e31f1SJunchao Zhang PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations") 208c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 21cc6e31f1SJunchao Zhang PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END() 22cc6e31f1SJunchao Zhang 2386a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp> 2486a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp> 25076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp> 26076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp> 279d13fa56SJunchao Zhang #include <KokkosBatched_LU_Decl.hpp> 289d13fa56SJunchao Zhang #include <KokkosBatched_InverseLU_Decl.hpp> 2986a27549SJunchao Zhang 3042550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 318c3ff71bSJunchao Zhang 320e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 33f98996d3SJunchao Zhang #include <KokkosSparse_Utils.hpp> 34f98996d3SJunchao Zhang using KokkosSparse::sort_crs_matrix; 359371c9d4SSatish Balay using KokkosSparse::Impl::transpose_matrix; 36f98996d3SJunchao Zhang #else 37f98996d3SJunchao Zhang #include <KokkosKernels_Sorting.hpp> 38f98996d3SJunchao Zhang using KokkosKernels::sort_crs_matrix; 399371c9d4SSatish Balay using KokkosKernels::Impl::transpose_matrix; 40f98996d3SJunchao Zhang #endif 41f98996d3SJunchao Zhang 428c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 438c3ff71bSJunchao Zhang 44076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 45076ba34aSJunchao Zhang we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 46076ba34aSJunchao Zhang In the latter case, it is important to set a_dual's sync state correctly. 47076ba34aSJunchao Zhang */ 48d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 49d71ae5a4SJacob Faibussowitsch { 50076ba34aSJunchao Zhang Mat_SeqAIJ *aijseq; 51076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 528c3ff71bSJunchao Zhang 538c3ff71bSJunchao Zhang PetscFunctionBegin; 543ba16761SJacob Faibussowitsch if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 559566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 56076ba34aSJunchao Zhang 57076ba34aSJunchao Zhang aijseq = static_cast<Mat_SeqAIJ *>(A->data); 58076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 59076ba34aSJunchao Zhang 60076ba34aSJunchao Zhang /* If aijkok does not exist, we just copy i, j to device. 61076ba34aSJunchao Zhang If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 62076ba34aSJunchao Zhang In both cases, we build a new aijkok structure. 63076ba34aSJunchao Zhang */ 64076ba34aSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 65076ba34aSJunchao Zhang delete aijkok; 66f4747e26SJunchao Zhang aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 67076ba34aSJunchao Zhang A->spptr = aijkok; 68f4747e26SJunchao Zhang } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 69f4747e26SJunchao Zhang MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 70f4747e26SJunchao Zhang auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 71f4747e26SJunchao Zhang aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 72076ba34aSJunchao Zhang } 733ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 748c3ff71bSJunchao Zhang } 758c3ff71bSJunchao Zhang 7686a27549SJunchao Zhang /* Sync CSR data to device if not yet */ 77d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 78d71ae5a4SJacob Faibussowitsch { 798c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 808c3ff71bSJunchao Zhang 818c3ff71bSJunchao Zhang PetscFunctionBegin; 82aaa8cc7dSPierre Jolivet PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 835f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 84076ba34aSJunchao Zhang if (aijkok->a_dual.need_sync_device()) { 85076ba34aSJunchao Zhang aijkok->a_dual.sync_device(); 86580c7c76SPierre Jolivet aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 8786a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 888c3ff71bSJunchao Zhang } 893ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 908c3ff71bSJunchao Zhang } 918c3ff71bSJunchao Zhang 92076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */ 93d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 94d71ae5a4SJacob Faibussowitsch { 9586a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 9686a27549SJunchao Zhang 9786a27549SJunchao Zhang PetscFunctionBegin; 985f80ce2aSJacob Faibussowitsch PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 9986a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 10086a27549SJunchao Zhang aijkok->a_dual.modify_device(); 10186a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 10286a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 1039566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 1049566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 1053ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 10686a27549SJunchao Zhang } 10786a27549SJunchao Zhang 108d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 109d71ae5a4SJacob Faibussowitsch { 110f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1114df4a32cSJunchao Zhang auto exec = PetscGetKokkosExecutionSpace(); 112f0cf5187SStefano Zampini 113f0cf5187SStefano Zampini PetscFunctionBegin; 114f0cf5187SStefano Zampini PetscCheckTypeName(A, MATSEQAIJKOKKOS); 11586a27549SJunchao Zhang /* We do not expect one needs factors on host */ 116aaa8cc7dSPierre Jolivet PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 1175f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 118e36ced11SJunchao Zhang PetscCallCXX(aijkok->a_dual.sync_host(exec)); 119e36ced11SJunchao Zhang PetscCallCXX(exec.fence()); 1203ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 121f0cf5187SStefano Zampini } 122f0cf5187SStefano Zampini 123d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 124d71ae5a4SJacob Faibussowitsch { 125076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 126f0cf5187SStefano Zampini 127f0cf5187SStefano Zampini PetscFunctionBegin; 1285519a089SJose E. Roman /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 1295519a089SJose E. Roman Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 1305519a089SJose E. Roman reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 1315519a089SJose E. Roman must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 1325519a089SJose E. Roman */ 1335519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 1344df4a32cSJunchao Zhang auto exec = PetscGetKokkosExecutionSpace(); 135e36ced11SJunchao Zhang PetscCallCXX(aijkok->a_dual.sync_host(exec)); 136e36ced11SJunchao Zhang PetscCallCXX(exec.fence()); 137076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 138076ba34aSJunchao Zhang } else { /* Happens when calling MatSetValues on a newly created matrix */ 139076ba34aSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 140076ba34aSJunchao Zhang } 1413ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 142076ba34aSJunchao Zhang } 143076ba34aSJunchao Zhang 144d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 145d71ae5a4SJacob Faibussowitsch { 146076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 147076ba34aSJunchao Zhang 148076ba34aSJunchao Zhang PetscFunctionBegin; 1495519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 1503ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 151076ba34aSJunchao Zhang } 152076ba34aSJunchao Zhang 153d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 154d71ae5a4SJacob Faibussowitsch { 155076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 156076ba34aSJunchao Zhang 157076ba34aSJunchao Zhang PetscFunctionBegin; 1585519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 1594df4a32cSJunchao Zhang auto exec = PetscGetKokkosExecutionSpace(); 160e36ced11SJunchao Zhang PetscCallCXX(aijkok->a_dual.sync_host(exec)); 161e36ced11SJunchao Zhang PetscCallCXX(exec.fence()); 162076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1632328674fSJunchao Zhang } else { 1642328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 1652328674fSJunchao Zhang } 1663ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 167076ba34aSJunchao Zhang } 168076ba34aSJunchao Zhang 169d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 170d71ae5a4SJacob Faibussowitsch { 171076ba34aSJunchao Zhang PetscFunctionBegin; 172076ba34aSJunchao Zhang *array = NULL; 1733ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 174076ba34aSJunchao Zhang } 175076ba34aSJunchao Zhang 176d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 177d71ae5a4SJacob Faibussowitsch { 178076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 179076ba34aSJunchao Zhang 180076ba34aSJunchao Zhang PetscFunctionBegin; 1815519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 182076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1832328674fSJunchao Zhang } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 1842328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 1852328674fSJunchao Zhang } 1863ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 187076ba34aSJunchao Zhang } 188076ba34aSJunchao Zhang 189d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 190d71ae5a4SJacob Faibussowitsch { 191076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 192076ba34aSJunchao Zhang 193076ba34aSJunchao Zhang PetscFunctionBegin; 1945519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 195076ba34aSJunchao Zhang aijkok->a_dual.clear_sync_state(); 196076ba34aSJunchao Zhang aijkok->a_dual.modify_host(); 1972328674fSJunchao Zhang } 1983ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 199f0cf5187SStefano Zampini } 200f0cf5187SStefano Zampini 201d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 202d71ae5a4SJacob Faibussowitsch { 2037ee59b9bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 2047ee59b9bSJunchao Zhang 2057ee59b9bSJunchao Zhang PetscFunctionBegin; 2067ee59b9bSJunchao Zhang PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 2077ee59b9bSJunchao Zhang 2087ee59b9bSJunchao Zhang if (i) *i = aijkok->i_device_data(); 2097ee59b9bSJunchao Zhang if (j) *j = aijkok->j_device_data(); 2107ee59b9bSJunchao Zhang if (a) { 2117ee59b9bSJunchao Zhang aijkok->a_dual.sync_device(); 2127ee59b9bSJunchao Zhang *a = aijkok->a_device_data(); 2137ee59b9bSJunchao Zhang } 2147ee59b9bSJunchao Zhang if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 2153ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2167ee59b9bSJunchao Zhang } 2177ee59b9bSJunchao Zhang 2180e3ece09SJunchao Zhang /* 2190e3ece09SJunchao Zhang Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 2200e3ece09SJunchao Zhang 2210e3ece09SJunchao Zhang Input Parameter: 2220e3ece09SJunchao Zhang . A - the MATSEQAIJKOKKOS matrix 2230e3ece09SJunchao Zhang 2240e3ece09SJunchao Zhang Output Parameters: 2250e3ece09SJunchao Zhang + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 226aaa8cc7dSPierre Jolivet - T_d - the transpose on device, whose value array is allocated but not initialized 2270e3ece09SJunchao Zhang */ 2280e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 229d71ae5a4SJacob Faibussowitsch { 2300e3ece09SJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 2310e3ece09SJunchao Zhang PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 2320e3ece09SJunchao Zhang const PetscInt *Ai = aseq->i, *Aj = aseq->j; 2337b8d4ba6SJunchao Zhang MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 2340e3ece09SJunchao Zhang MatRowMapType *Ti = Ti_h.data(); 2357b8d4ba6SJunchao Zhang MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 2367b8d4ba6SJunchao Zhang MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 2370e3ece09SJunchao Zhang PetscInt *Tj = Tj_h.data(); 2380e3ece09SJunchao Zhang PetscInt *perm = perm_h.data(); 2390e3ece09SJunchao Zhang PetscInt *offset; 240152b3e56SJunchao Zhang 241152b3e56SJunchao Zhang PetscFunctionBegin; 2420e3ece09SJunchao Zhang // Populate Ti 2430e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 2440e3ece09SJunchao Zhang Ti++; 2450e3ece09SJunchao Zhang for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 2460e3ece09SJunchao Zhang Ti--; 2470e3ece09SJunchao Zhang for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 2480e3ece09SJunchao Zhang 2490e3ece09SJunchao Zhang // Populate Tj and the permutation array 2500e3ece09SJunchao Zhang PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 2510e3ece09SJunchao Zhang for (PetscInt i = 0; i < m; i++) { 2520e3ece09SJunchao Zhang for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 2530e3ece09SJunchao Zhang PetscInt r = Aj[j]; // row r of T 2540e3ece09SJunchao Zhang PetscInt disp = Ti[r] + offset[r]; 2550e3ece09SJunchao Zhang 2560e3ece09SJunchao Zhang Tj[disp] = i; // col i of T 2570e3ece09SJunchao Zhang perm[disp] = j; 2580e3ece09SJunchao Zhang offset[r]++; 259076ba34aSJunchao Zhang } 2600e3ece09SJunchao Zhang } 2610e3ece09SJunchao Zhang PetscCall(PetscFree(offset)); 2620e3ece09SJunchao Zhang 2630e3ece09SJunchao Zhang // Sort each row of T, along with the permutation array 2640e3ece09SJunchao Zhang for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 2650e3ece09SJunchao Zhang 2660e3ece09SJunchao Zhang // Output perm and T on device 2670e3ece09SJunchao Zhang auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 2680e3ece09SJunchao Zhang auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 2690e3ece09SJunchao Zhang PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 2700e3ece09SJunchao Zhang PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 2713ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 272152b3e56SJunchao Zhang } 273152b3e56SJunchao Zhang 2740e3ece09SJunchao Zhang // Generate the transpose on device and cache it internally 2750e3ece09SJunchao Zhang // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 2760e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 277d71ae5a4SJacob Faibussowitsch { 2780e3ece09SJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 2790e3ece09SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 2800e3ece09SJunchao Zhang PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 2810e3ece09SJunchao Zhang KokkosCsrMatrix &T = akok->csrmatT; 282152b3e56SJunchao Zhang 283152b3e56SJunchao Zhang PetscFunctionBegin; 2840e3ece09SJunchao Zhang PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 285145b44c9SPierre Jolivet PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 2860e3ece09SJunchao Zhang 2870e3ece09SJunchao Zhang const auto &Aa = akok->a_dual.view_device(); 2880e3ece09SJunchao Zhang 2890e3ece09SJunchao Zhang if (A->symmetric == PETSC_BOOL3_TRUE) { 2900e3ece09SJunchao Zhang *csrmatT = akok->csrmat; 2910e3ece09SJunchao Zhang } else { 2920e3ece09SJunchao Zhang // See if we already have a cached transpose and its value is up to date 2930e3ece09SJunchao Zhang if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 2940e3ece09SJunchao Zhang if (!akok->transpose_updated) { // if the value is out of date, update the cached version 2950e3ece09SJunchao Zhang const auto &perm = akok->transpose_perm; // get the permutation array 2960e3ece09SJunchao Zhang auto &Ta = T.values; 2970e3ece09SJunchao Zhang 298d326c3f1SJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 299076ba34aSJunchao Zhang } 3000e3ece09SJunchao Zhang } else { // Generate T of size n x m for the first time 3010e3ece09SJunchao Zhang MatRowMapKokkosView perm; 3020e3ece09SJunchao Zhang 3030e3ece09SJunchao Zhang PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 3040e3ece09SJunchao Zhang akok->transpose_perm = perm; // cache the perm in this matrix for reuse 305d326c3f1SJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 3060e3ece09SJunchao Zhang } 3070e3ece09SJunchao Zhang akok->transpose_updated = PETSC_TRUE; 3080e3ece09SJunchao Zhang *csrmatT = akok->csrmatT; 3090e3ece09SJunchao Zhang } 3100e3ece09SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 3110e3ece09SJunchao Zhang } 3120e3ece09SJunchao Zhang 3130e3ece09SJunchao Zhang // Generate the Hermitian on device and cache it internally 3140e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 3150e3ece09SJunchao Zhang { 3160e3ece09SJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 3170e3ece09SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 3180e3ece09SJunchao Zhang PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 3190e3ece09SJunchao Zhang KokkosCsrMatrix &T = akok->csrmatH; 3200e3ece09SJunchao Zhang 3210e3ece09SJunchao Zhang PetscFunctionBegin; 3220e3ece09SJunchao Zhang PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 3230e3ece09SJunchao Zhang PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 3240e3ece09SJunchao Zhang 3250e3ece09SJunchao Zhang const auto &Aa = akok->a_dual.view_device(); 3260e3ece09SJunchao Zhang 3270e3ece09SJunchao Zhang if (A->hermitian == PETSC_BOOL3_TRUE) { 3280e3ece09SJunchao Zhang *csrmatH = akok->csrmat; 3290e3ece09SJunchao Zhang } else { 3300e3ece09SJunchao Zhang // See if we already have a cached hermitian and its value is up to date 3310e3ece09SJunchao Zhang if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 3320e3ece09SJunchao Zhang if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 3330e3ece09SJunchao Zhang const auto &perm = akok->transpose_perm; // get the permutation array 3340e3ece09SJunchao Zhang auto &Ta = T.values; 3350e3ece09SJunchao Zhang 336d326c3f1SJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 3370e3ece09SJunchao Zhang } 3380e3ece09SJunchao Zhang } else { // Generate T of size n x m for the first time 3390e3ece09SJunchao Zhang MatRowMapKokkosView perm; 3400e3ece09SJunchao Zhang 3410e3ece09SJunchao Zhang PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 3420e3ece09SJunchao Zhang akok->transpose_perm = perm; // cache the perm in this matrix for reuse 343d326c3f1SJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 3440e3ece09SJunchao Zhang } 3450e3ece09SJunchao Zhang akok->hermitian_updated = PETSC_TRUE; 3460e3ece09SJunchao Zhang *csrmatH = akok->csrmatH; 3470e3ece09SJunchao Zhang } 3483ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 349152b3e56SJunchao Zhang } 350a587d139SMark 3518c3ff71bSJunchao Zhang /* y = A x */ 352d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 353d71ae5a4SJacob Faibussowitsch { 3548c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 355152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 356152b3e56SJunchao Zhang PetscScalarKokkosView yv; 3578c3ff71bSJunchao Zhang 3588c3ff71bSJunchao Zhang PetscFunctionBegin; 3599566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3609566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3619566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 3629566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 3638c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 364d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 3659566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 3669566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 367076ba34aSJunchao Zhang /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 3689566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 3699566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3703ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3718c3ff71bSJunchao Zhang } 3728c3ff71bSJunchao Zhang 3738c3ff71bSJunchao Zhang /* y = A^T x */ 374d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 375d71ae5a4SJacob Faibussowitsch { 3768c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 377152b3e56SJunchao Zhang const char *mode; 378152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 379152b3e56SJunchao Zhang PetscScalarKokkosView yv; 3800e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 3818c3ff71bSJunchao Zhang 3828c3ff71bSJunchao Zhang PetscFunctionBegin; 3839566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3849566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3859566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 3869566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 387152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 3889566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 389152b3e56SJunchao Zhang mode = "N"; 390152b3e56SJunchao Zhang } else { 391076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 3920e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 393152b3e56SJunchao Zhang mode = "T"; 394152b3e56SJunchao Zhang } 395d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 3969566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 3979566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 3980e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 3999566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4003ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4018c3ff71bSJunchao Zhang } 4028c3ff71bSJunchao Zhang 4038c3ff71bSJunchao Zhang /* y = A^H x */ 404d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 405d71ae5a4SJacob Faibussowitsch { 4068c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 407152b3e56SJunchao Zhang const char *mode; 408152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 409152b3e56SJunchao Zhang PetscScalarKokkosView yv; 4100e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 4118c3ff71bSJunchao Zhang 4128c3ff71bSJunchao Zhang PetscFunctionBegin; 4139566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4149566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 4159566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 4169566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 417152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 4189566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 419152b3e56SJunchao Zhang mode = "N"; 420152b3e56SJunchao Zhang } else { 421076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 4220e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 423152b3e56SJunchao Zhang mode = "C"; 424152b3e56SJunchao Zhang } 425d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 4269566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 4279566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 4280e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 4299566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4303ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4318c3ff71bSJunchao Zhang } 4328c3ff71bSJunchao Zhang 4338c3ff71bSJunchao Zhang /* z = A x + y */ 434d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 435d71ae5a4SJacob Faibussowitsch { 4368c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 43792896123SJunchao Zhang ConstPetscScalarKokkosView xv; 438152b3e56SJunchao Zhang PetscScalarKokkosView zv; 4398c3ff71bSJunchao Zhang 4408c3ff71bSJunchao Zhang PetscFunctionBegin; 4419566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4429566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 44392896123SJunchao Zhang if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host 4449566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 44592896123SJunchao Zhang PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device 4468c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 447d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 4489566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 44992896123SJunchao Zhang PetscCall(VecRestoreKokkosView(zz, &zv)); 4509566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 4519566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4523ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4538c3ff71bSJunchao Zhang } 4548c3ff71bSJunchao Zhang 4558c3ff71bSJunchao Zhang /* z = A^T x + y */ 456d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 457d71ae5a4SJacob Faibussowitsch { 4588c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 459152b3e56SJunchao Zhang const char *mode; 46092896123SJunchao Zhang ConstPetscScalarKokkosView xv; 461152b3e56SJunchao Zhang PetscScalarKokkosView zv; 4620e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 4638c3ff71bSJunchao Zhang 4648c3ff71bSJunchao Zhang PetscFunctionBegin; 4659566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4669566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 46792896123SJunchao Zhang if (zz != yy) PetscCall(VecCopy(yy, zz)); 4689566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 46992896123SJunchao Zhang PetscCall(VecGetKokkosView(zz, &zv)); 470152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 4719566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 472152b3e56SJunchao Zhang mode = "N"; 473152b3e56SJunchao Zhang } else { 474076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 4750e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 476152b3e56SJunchao Zhang mode = "T"; 477152b3e56SJunchao Zhang } 478d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 4799566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 48092896123SJunchao Zhang PetscCall(VecRestoreKokkosView(zz, &zv)); 4810e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 4829566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4833ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4848c3ff71bSJunchao Zhang } 4858c3ff71bSJunchao Zhang 4868c3ff71bSJunchao Zhang /* z = A^H x + y */ 487d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 488d71ae5a4SJacob Faibussowitsch { 4898c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 490152b3e56SJunchao Zhang const char *mode; 49192896123SJunchao Zhang ConstPetscScalarKokkosView xv; 492152b3e56SJunchao Zhang PetscScalarKokkosView zv; 4930e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 4948c3ff71bSJunchao Zhang 4958c3ff71bSJunchao Zhang PetscFunctionBegin; 4969566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4979566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 49892896123SJunchao Zhang if (zz != yy) PetscCall(VecCopy(yy, zz)); 4999566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 50092896123SJunchao Zhang PetscCall(VecGetKokkosView(zz, &zv)); 501152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 5029566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 503152b3e56SJunchao Zhang mode = "N"; 504152b3e56SJunchao Zhang } else { 505076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 5060e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 507152b3e56SJunchao Zhang mode = "C"; 508152b3e56SJunchao Zhang } 509d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 5109566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 51192896123SJunchao Zhang PetscCall(VecRestoreKokkosView(zz, &zv)); 5120e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 5139566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 5143ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 515152b3e56SJunchao Zhang } 516152b3e56SJunchao Zhang 51766976f2fSJacob Faibussowitsch static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 518d71ae5a4SJacob Faibussowitsch { 519152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 520152b3e56SJunchao Zhang 521152b3e56SJunchao Zhang PetscFunctionBegin; 522152b3e56SJunchao Zhang switch (op) { 523152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 524152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 5259566063dSJacob Faibussowitsch if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 526152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 527152b3e56SJunchao Zhang break; 528d71ae5a4SJacob Faibussowitsch default: 529d71ae5a4SJacob Faibussowitsch PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 530d71ae5a4SJacob Faibussowitsch break; 531152b3e56SJunchao Zhang } 5323ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5338c3ff71bSJunchao Zhang } 5348c3ff71bSJunchao Zhang 535076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */ 536d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 537d71ae5a4SJacob Faibussowitsch { 538076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 5398c3ff71bSJunchao Zhang 5408c3ff71bSJunchao Zhang PetscFunctionBegin; 5419566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 542076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 5439566063dSJacob Faibussowitsch PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 5448c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 5459566063dSJacob Faibussowitsch PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 546076ba34aSJunchao Zhang } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 5475f80ce2aSJacob Faibussowitsch PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 5489566063dSJacob Faibussowitsch PetscCall(PetscFree(A->defaultvectype)); 5499566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 5509566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 5519566063dSJacob Faibussowitsch PetscCall(MatSetOps_SeqAIJKokkos(A)); 552076ba34aSJunchao Zhang aseq = static_cast<Mat_SeqAIJ *>(A->data); 553394ed5ebSJunchao Zhang if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 5545f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 555f4747e26SJunchao Zhang A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 5568c3ff71bSJunchao Zhang } 557076ba34aSJunchao Zhang } 5583ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5598c3ff71bSJunchao Zhang } 5608c3ff71bSJunchao Zhang 561076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 562076ba34aSJunchao Zhang an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 563076ba34aSJunchao Zhang */ 564d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 565d71ae5a4SJacob Faibussowitsch { 566076ba34aSJunchao Zhang Mat_SeqAIJ *bseq; 567076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 568076ba34aSJunchao Zhang Mat mat; 5698c3ff71bSJunchao Zhang 5708c3ff71bSJunchao Zhang PetscFunctionBegin; 571076ba34aSJunchao Zhang /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 5729566063dSJacob Faibussowitsch PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 573076ba34aSJunchao Zhang mat = *B; 574f4747e26SJunchao Zhang if (A->assembled) { 575076ba34aSJunchao Zhang bseq = static_cast<Mat_SeqAIJ *>(mat->data); 576f4747e26SJunchao Zhang bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 577076ba34aSJunchao Zhang bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 578076ba34aSJunchao Zhang /* Now copy values to B if needed */ 579076ba34aSJunchao Zhang if (dupOption == MAT_COPY_VALUES) { 580076ba34aSJunchao Zhang if (akok->a_dual.need_sync_device()) { 581076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 582076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 583076ba34aSJunchao Zhang } else { /* If device has the latest data, we only copy data on device */ 584076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 585076ba34aSJunchao Zhang bkok->a_dual.modify_device(); 586076ba34aSJunchao Zhang } 587076ba34aSJunchao Zhang } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 588076ba34aSJunchao Zhang /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 589076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 590076ba34aSJunchao Zhang } 591076ba34aSJunchao Zhang mat->spptr = bkok; 592076ba34aSJunchao Zhang } 593076ba34aSJunchao Zhang 5949566063dSJacob Faibussowitsch PetscCall(PetscFree(mat->defaultvectype)); 5959566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 5969566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 5979566063dSJacob Faibussowitsch PetscCall(MatSetOps_SeqAIJKokkos(mat)); 5983ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5998c3ff71bSJunchao Zhang } 6008c3ff71bSJunchao Zhang 601d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 602d71ae5a4SJacob Faibussowitsch { 6030ecb592aSJunchao Zhang Mat At; 6040e3ece09SJunchao Zhang KokkosCsrMatrix internT; 6050ecb592aSJunchao Zhang Mat_SeqAIJKokkos *atkok, *bkok; 6060ecb592aSJunchao Zhang 6070ecb592aSJunchao Zhang PetscFunctionBegin; 6087fb60732SBarry Smith if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 6099566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 6100ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 611ff751488SJunchao Zhang /* Deep copy internT, as we want to isolate the internal transpose */ 6120e3ece09SJunchao Zhang PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 6139566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 6140ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) *B = At; 6159566063dSJacob Faibussowitsch else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 6160ecb592aSJunchao Zhang } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 6170ecb592aSJunchao Zhang if ((*B)->assembled) { 6180ecb592aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 6190e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 6209566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 6210ecb592aSJunchao Zhang } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 6220ecb592aSJunchao Zhang Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 6230e3ece09SJunchao Zhang MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 6240e3ece09SJunchao Zhang MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 6250e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 6260e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 6270ecb592aSJunchao Zhang } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 6280ecb592aSJunchao Zhang } 6293ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6300ecb592aSJunchao Zhang } 6310ecb592aSJunchao Zhang 632d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 633d71ae5a4SJacob Faibussowitsch { 63486a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 6358c3ff71bSJunchao Zhang 6368c3ff71bSJunchao Zhang PetscFunctionBegin; 63786a27549SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 63886a27549SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 6398c3ff71bSJunchao Zhang delete aijkok; 64086a27549SJunchao Zhang } else { 64186a27549SJunchao Zhang delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 64286a27549SJunchao Zhang } 643cbc6b225SStefano Zampini A->spptr = NULL; 6449566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 6459566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 6469566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 64757761e9aSJunchao Zhang #if defined(PETSC_HAVE_HYPRE) 64857761e9aSJunchao Zhang PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL)); 64957761e9aSJunchao Zhang #endif 6509566063dSJacob Faibussowitsch PetscCall(MatDestroy_SeqAIJ(A)); 6513ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6528c3ff71bSJunchao Zhang } 6538c3ff71bSJunchao Zhang 6543f3ba80aSJunchao Zhang /*MC 6553f3ba80aSJunchao Zhang MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 6563f3ba80aSJunchao Zhang 65715229ffcSPierre Jolivet A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 6583f3ba80aSJunchao Zhang 6592ef1f0ffSBarry Smith Options Database Key: 66011a5261eSBarry Smith . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 6613f3ba80aSJunchao Zhang 6623f3ba80aSJunchao Zhang Level: beginner 6633f3ba80aSJunchao Zhang 6641cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 6653f3ba80aSJunchao Zhang M*/ 666d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 667d71ae5a4SJacob Faibussowitsch { 66886a27549SJunchao Zhang PetscFunctionBegin; 6699566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 6709566063dSJacob Faibussowitsch PetscCall(MatCreate_SeqAIJ(A)); 6719566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 6723ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 67386a27549SJunchao Zhang } 67486a27549SJunchao Zhang 675076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 676d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 677d71ae5a4SJacob Faibussowitsch { 678076ba34aSJunchao Zhang Mat_SeqAIJ *a, *b; 679076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok, *bkok, *ckok; 680076ba34aSJunchao Zhang MatScalarKokkosView aa, ba, ca; 681076ba34aSJunchao Zhang MatRowMapKokkosView ai, bi, ci; 682076ba34aSJunchao Zhang MatColIdxKokkosView aj, bj, cj; 683076ba34aSJunchao Zhang PetscInt m, n, nnz, aN; 684a3f881fbSStefano Zampini 685a3f881fbSStefano Zampini PetscFunctionBegin; 686076ba34aSJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 687076ba34aSJunchao Zhang PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 6884f572ea9SToby Isaac PetscAssertPointer(C, 4); 689076ba34aSJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 690076ba34aSJunchao Zhang PetscCheckTypeName(B, MATSEQAIJKOKKOS); 6915f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 6925f80ce2aSJacob Faibussowitsch PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 693076ba34aSJunchao Zhang 6949566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 6959566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 696076ba34aSJunchao Zhang a = static_cast<Mat_SeqAIJ *>(A->data); 697076ba34aSJunchao Zhang b = static_cast<Mat_SeqAIJ *>(B->data); 698076ba34aSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 699076ba34aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 700076ba34aSJunchao Zhang aa = akok->a_dual.view_device(); 701076ba34aSJunchao Zhang ai = akok->i_dual.view_device(); 702076ba34aSJunchao Zhang ba = bkok->a_dual.view_device(); 703076ba34aSJunchao Zhang bi = bkok->i_dual.view_device(); 704076ba34aSJunchao Zhang m = A->rmap->n; /* M, N and nnz of C */ 705076ba34aSJunchao Zhang n = A->cmap->n + B->cmap->n; 706076ba34aSJunchao Zhang nnz = a->nz + b->nz; 707076ba34aSJunchao Zhang aN = A->cmap->n; /* N of A */ 708076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { 709076ba34aSJunchao Zhang aj = akok->j_dual.view_device(); 710076ba34aSJunchao Zhang bj = bkok->j_dual.view_device(); 711076ba34aSJunchao Zhang auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 712076ba34aSJunchao Zhang auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 713076ba34aSJunchao Zhang auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 714076ba34aSJunchao Zhang ca = ca_dual.view_device(); 715076ba34aSJunchao Zhang ci = ci_dual.view_device(); 716076ba34aSJunchao Zhang cj = cj_dual.view_device(); 717076ba34aSJunchao Zhang 718076ba34aSJunchao Zhang /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 7199371c9d4SSatish Balay Kokkos::parallel_for( 720d326c3f1SJunchao Zhang Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 721076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 722076ba34aSJunchao Zhang PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 723076ba34aSJunchao Zhang 724076ba34aSJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 725076ba34aSJunchao Zhang ci(i) = coffset; 726076ba34aSJunchao Zhang if (i == m - 1) ci(m) = ai(m) + bi(m); 727076ba34aSJunchao Zhang }); 728076ba34aSJunchao Zhang 729076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 730076ba34aSJunchao Zhang if (k < alen) { 731076ba34aSJunchao Zhang ca(coffset + k) = aa(ai(i) + k); 732076ba34aSJunchao Zhang cj(coffset + k) = aj(ai(i) + k); 733076ba34aSJunchao Zhang } else { 734076ba34aSJunchao Zhang ca(coffset + k) = ba(bi(i) + k - alen); 735076ba34aSJunchao Zhang cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 736076ba34aSJunchao Zhang } 737076ba34aSJunchao Zhang }); 738076ba34aSJunchao Zhang }); 739076ba34aSJunchao Zhang ca_dual.modify_device(); 740076ba34aSJunchao Zhang ci_dual.modify_device(); 741076ba34aSJunchao Zhang cj_dual.modify_device(); 7429566063dSJacob Faibussowitsch PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 7439566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 744076ba34aSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { 745076ba34aSJunchao Zhang PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 746076ba34aSJunchao Zhang PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 747076ba34aSJunchao Zhang ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 748076ba34aSJunchao Zhang ca = ckok->a_dual.view_device(); 749076ba34aSJunchao Zhang ci = ckok->i_dual.view_device(); 750076ba34aSJunchao Zhang 7519371c9d4SSatish Balay Kokkos::parallel_for( 752d326c3f1SJunchao Zhang Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 753076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 754076ba34aSJunchao Zhang PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 755076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 756076ba34aSJunchao Zhang if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 757076ba34aSJunchao Zhang else ca(ci(i) + k) = ba(bi(i) + k - alen); 758076ba34aSJunchao Zhang }); 759076ba34aSJunchao Zhang }); 7609566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 761076ba34aSJunchao Zhang } 7623ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 763076ba34aSJunchao Zhang } 764076ba34aSJunchao Zhang 765d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 766d71ae5a4SJacob Faibussowitsch { 767076ba34aSJunchao Zhang PetscFunctionBegin; 768076ba34aSJunchao Zhang delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 7693ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 770a3f881fbSStefano Zampini } 771a3f881fbSStefano Zampini 772d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 773d71ae5a4SJacob Faibussowitsch { 774a3f881fbSStefano Zampini Mat_Product *product = C->product; 775a3f881fbSStefano Zampini Mat A, B; 776076ba34aSJunchao Zhang bool transA, transB; /* use bool, since KK needs this type */ 777a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok, *bkok, *ckok; 778a3f881fbSStefano Zampini Mat_SeqAIJ *c; 779076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 7800e3ece09SJunchao Zhang KokkosCsrMatrix csrmatA, csrmatB; 781a3f881fbSStefano Zampini 782a3f881fbSStefano Zampini PetscFunctionBegin; 783a3f881fbSStefano Zampini MatCheckProduct(C, 1); 7845f80ce2aSJacob Faibussowitsch PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 785076ba34aSJunchao Zhang pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 786076ba34aSJunchao Zhang 7870e3ece09SJunchao Zhang // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 7880e3ece09SJunchao Zhang // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 7890e3ece09SJunchao Zhang // we still do numeric. 7900e3ece09SJunchao Zhang if (pdata->reusesym) { // numeric reuses results from symbolic 7910e3ece09SJunchao Zhang pdata->reusesym = PETSC_FALSE; 7923ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 793076ba34aSJunchao Zhang } 794076ba34aSJunchao Zhang 795076ba34aSJunchao Zhang switch (product->type) { 7969371c9d4SSatish Balay case MATPRODUCT_AB: 7979371c9d4SSatish Balay transA = false; 7989371c9d4SSatish Balay transB = false; 7999371c9d4SSatish Balay break; 8009371c9d4SSatish Balay case MATPRODUCT_AtB: 8019371c9d4SSatish Balay transA = true; 8029371c9d4SSatish Balay transB = false; 8039371c9d4SSatish Balay break; 8049371c9d4SSatish Balay case MATPRODUCT_ABt: 8059371c9d4SSatish Balay transA = false; 8069371c9d4SSatish Balay transB = true; 8079371c9d4SSatish Balay break; 808d71ae5a4SJacob Faibussowitsch default: 809d71ae5a4SJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 810076ba34aSJunchao Zhang } 811076ba34aSJunchao Zhang 812a3f881fbSStefano Zampini A = product->A; 813a3f881fbSStefano Zampini B = product->B; 8149566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 8159566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 816a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 817a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 818a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 819076ba34aSJunchao Zhang 8205f80ce2aSJacob Faibussowitsch PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 821076ba34aSJunchao Zhang 8220e3ece09SJunchao Zhang csrmatA = akok->csrmat; 8230e3ece09SJunchao Zhang csrmatB = bkok->csrmat; 824076ba34aSJunchao Zhang 825076ba34aSJunchao Zhang /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 826076ba34aSJunchao Zhang if (transA) { 8279566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 828076ba34aSJunchao Zhang transA = false; 829a3f881fbSStefano Zampini } 830a3f881fbSStefano Zampini 831076ba34aSJunchao Zhang if (transB) { 8329566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 833076ba34aSJunchao Zhang transB = false; 834076ba34aSJunchao Zhang } 8359566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 8360e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 8370e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 838866eb059SJunchao Zhang auto spgemmHandle = pdata->kh.get_spgemm_handle(); 839866eb059SJunchao Zhang if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 840e944a159SJunchao Zhang #endif 841866eb059SJunchao Zhang 8429566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 8439566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(C)); 844a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 845a3f881fbSStefano Zampini c = (Mat_SeqAIJ *)C->data; 8469566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 8479566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 8489566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 849a3f881fbSStefano Zampini c->reallocs = 0; 850076ba34aSJunchao Zhang C->info.mallocs = 0; 851a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 852a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 853a3f881fbSStefano Zampini C->num_ass++; 8543ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 855a3f881fbSStefano Zampini } 856a3f881fbSStefano Zampini 857d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 858d71ae5a4SJacob Faibussowitsch { 859076ba34aSJunchao Zhang Mat_Product *product = C->product; 860076ba34aSJunchao Zhang MatProductType ptype; 861076ba34aSJunchao Zhang Mat A, B; 862076ba34aSJunchao Zhang bool transA, transB; 863076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok, *bkok, *ckok; 864076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 865076ba34aSJunchao Zhang MPI_Comm comm; 8660e3ece09SJunchao Zhang KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 867a3f881fbSStefano Zampini 868a3f881fbSStefano Zampini PetscFunctionBegin; 869a3f881fbSStefano Zampini MatCheckProduct(C, 1); 8709566063dSJacob Faibussowitsch PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 8715f80ce2aSJacob Faibussowitsch PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 872a3f881fbSStefano Zampini A = product->A; 873a3f881fbSStefano Zampini B = product->B; 8749566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 8759566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 876a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 877a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 8780e3ece09SJunchao Zhang csrmatA = akok->csrmat; 8790e3ece09SJunchao Zhang csrmatB = bkok->csrmat; 880076ba34aSJunchao Zhang 881a3f881fbSStefano Zampini ptype = product->type; 8820e3ece09SJunchao Zhang // Take advantage of the symmetry if true 8830e3ece09SJunchao Zhang if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 8840e3ece09SJunchao Zhang ptype = MATPRODUCT_AB; 8850e3ece09SJunchao Zhang product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 8860e3ece09SJunchao Zhang } 8870e3ece09SJunchao Zhang if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 8880e3ece09SJunchao Zhang ptype = MATPRODUCT_AB; 8890e3ece09SJunchao Zhang product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 8900e3ece09SJunchao Zhang } 8910e3ece09SJunchao Zhang 892a3f881fbSStefano Zampini switch (ptype) { 8939371c9d4SSatish Balay case MATPRODUCT_AB: 8949371c9d4SSatish Balay transA = false; 8959371c9d4SSatish Balay transB = false; 8960e6a1e94SMark Adams PetscCall(MatSetBlockSizesFromMats(C, A, B)); 8979371c9d4SSatish Balay break; 8989371c9d4SSatish Balay case MATPRODUCT_AtB: 8999371c9d4SSatish Balay transA = true; 9009371c9d4SSatish Balay transB = false; 9010e6a1e94SMark Adams if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 9020e6a1e94SMark Adams if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 9039371c9d4SSatish Balay break; 9049371c9d4SSatish Balay case MATPRODUCT_ABt: 9059371c9d4SSatish Balay transA = false; 9069371c9d4SSatish Balay transB = true; 9070e6a1e94SMark Adams if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 9080e6a1e94SMark Adams if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 9099371c9d4SSatish Balay break; 910d71ae5a4SJacob Faibussowitsch default: 911d71ae5a4SJacob Faibussowitsch SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 912a3f881fbSStefano Zampini } 9130e3ece09SJunchao Zhang PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 914076ba34aSJunchao Zhang pdata->reusesym = product->api_user; 915a3f881fbSStefano Zampini 916076ba34aSJunchao Zhang /* TODO: add command line options to select spgemm algorithms */ 917866eb059SJunchao Zhang auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 918866eb059SJunchao Zhang 919866eb059SJunchao Zhang /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 920866eb059SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 921866eb059SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 922866eb059SJunchao Zhang spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 923866eb059SJunchao Zhang #endif 924866eb059SJunchao Zhang #endif 9250e3ece09SJunchao Zhang PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 926076ba34aSJunchao Zhang 9279566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 928076ba34aSJunchao Zhang /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 929076ba34aSJunchao Zhang if (transA) { 9309566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 931076ba34aSJunchao Zhang transA = false; 932076ba34aSJunchao Zhang } 933076ba34aSJunchao Zhang 934076ba34aSJunchao Zhang if (transB) { 9359566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 936076ba34aSJunchao Zhang transB = false; 937076ba34aSJunchao Zhang } 938076ba34aSJunchao Zhang 9390e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 940076ba34aSJunchao Zhang /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 941076ba34aSJunchao Zhang So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 942076ba34aSJunchao Zhang calling new Mat_SeqAIJKokkos(). 943076ba34aSJunchao Zhang TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 944076ba34aSJunchao Zhang */ 9450e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 9460e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 947866eb059SJunchao Zhang /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 948866eb059SJunchao Zhang auto spgemmHandle = pdata->kh.get_spgemm_handle(); 949866eb059SJunchao Zhang if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 950e944a159SJunchao Zhang #endif 9519566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 952076ba34aSJunchao Zhang 9539566063dSJacob Faibussowitsch PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 9549566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 955076ba34aSJunchao Zhang C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 9563ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 957a3f881fbSStefano Zampini } 958a3f881fbSStefano Zampini 959a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 960d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 961d71ae5a4SJacob Faibussowitsch { 962076ba34aSJunchao Zhang Mat_Product *product = mat->product; 963a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 964a3f881fbSStefano Zampini 965a3f881fbSStefano Zampini PetscFunctionBegin; 966a3f881fbSStefano Zampini MatCheckProduct(mat, 1); 9679566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 96848a46eb9SPierre Jolivet if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 969a3f881fbSStefano Zampini if (Biskok && Ciskok) { 970a3f881fbSStefano Zampini switch (product->type) { 971a3f881fbSStefano Zampini case MATPRODUCT_AB: 972a3f881fbSStefano Zampini case MATPRODUCT_AtB: 973d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABt: 974d71ae5a4SJacob Faibussowitsch mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 975d71ae5a4SJacob Faibussowitsch break; 976a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 977a3f881fbSStefano Zampini case MATPRODUCT_RARt: 978d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABC: 979d71ae5a4SJacob Faibussowitsch mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 980d71ae5a4SJacob Faibussowitsch break; 981d71ae5a4SJacob Faibussowitsch default: 982d71ae5a4SJacob Faibussowitsch break; 983a3f881fbSStefano Zampini } 984a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 9859566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 986a3f881fbSStefano Zampini } 9873ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 988a3f881fbSStefano Zampini } 989a587d139SMark 990d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 991d71ae5a4SJacob Faibussowitsch { 992f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 993f0cf5187SStefano Zampini 994f0cf5187SStefano Zampini PetscFunctionBegin; 9959566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 9969566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 997f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 998d326c3f1SJunchao Zhang KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 9999566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 10009566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 10019566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 10023ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1003f0cf5187SStefano Zampini } 1004f0cf5187SStefano Zampini 1005f4747e26SJunchao Zhang // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 1006f4747e26SJunchao Zhang static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 1007f4747e26SJunchao Zhang { 1008f4747e26SJunchao Zhang Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1009f4747e26SJunchao Zhang 1010f4747e26SJunchao Zhang PetscFunctionBegin; 1011f4747e26SJunchao Zhang if (A->assembled && aijseq->diagonaldense) { // no missing diagonals 1012f4747e26SJunchao Zhang PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 1013f4747e26SJunchao Zhang 1014f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1015f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1016f4747e26SJunchao Zhang const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1017f4747e26SJunchao Zhang const auto &Aa = aijkok->a_dual.view_device(); 1018f4747e26SJunchao Zhang const auto &Adiag = aijkok->diag_dual.view_device(); 1019d326c3f1SJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1020f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1021f4747e26SJunchao Zhang PetscCall(PetscLogGpuFlops(n)); 1022f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1023f4747e26SJunchao Zhang } else { // need reassembly, very slow! 1024f4747e26SJunchao Zhang PetscCall(MatShift_Basic(A, a)); 1025f4747e26SJunchao Zhang } 1026f4747e26SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 1027f4747e26SJunchao Zhang } 1028f4747e26SJunchao Zhang 1029f4747e26SJunchao Zhang static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1030f4747e26SJunchao Zhang { 1031f4747e26SJunchao Zhang Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1032f4747e26SJunchao Zhang 1033f4747e26SJunchao Zhang PetscFunctionBegin; 1034f4747e26SJunchao Zhang if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals 1035f4747e26SJunchao Zhang ConstPetscScalarKokkosView dv; 1036f4747e26SJunchao Zhang PetscInt n, nv; 1037f4747e26SJunchao Zhang 1038f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1039f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1040f4747e26SJunchao Zhang PetscCall(VecGetKokkosView(D, &dv)); 1041f4747e26SJunchao Zhang PetscCall(VecGetLocalSize(D, &nv)); 1042f4747e26SJunchao Zhang n = PetscMin(Y->rmap->n, Y->cmap->n); 1043f4747e26SJunchao Zhang PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1044f4747e26SJunchao Zhang 1045f4747e26SJunchao Zhang const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1046f4747e26SJunchao Zhang const auto &Aa = aijkok->a_dual.view_device(); 1047f4747e26SJunchao Zhang const auto &Adiag = aijkok->diag_dual.view_device(); 1048f4747e26SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( 1049d326c3f1SJunchao Zhang Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1050f4747e26SJunchao Zhang if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1051f4747e26SJunchao Zhang else Aa(Adiag(i)) += dv(i); 1052f4747e26SJunchao Zhang })); 1053f4747e26SJunchao Zhang PetscCall(VecRestoreKokkosView(D, &dv)); 1054f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1055f4747e26SJunchao Zhang PetscCall(PetscLogGpuFlops(n)); 1056f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1057f4747e26SJunchao Zhang } else { // need reassembly, very slow! 1058f4747e26SJunchao Zhang PetscCall(MatDiagonalSet_Default(Y, D, is)); 1059f4747e26SJunchao Zhang } 1060f4747e26SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 1061f4747e26SJunchao Zhang } 1062f4747e26SJunchao Zhang 1063f4747e26SJunchao Zhang static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1064f4747e26SJunchao Zhang { 1065f4747e26SJunchao Zhang Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1066f4747e26SJunchao Zhang PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1067f4747e26SJunchao Zhang ConstPetscScalarKokkosView lv, rv; 1068f4747e26SJunchao Zhang 1069f4747e26SJunchao Zhang PetscFunctionBegin; 1070f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1071f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1072f4747e26SJunchao Zhang const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1073f4747e26SJunchao Zhang const auto &Aa = aijkok->a_dual.view_device(); 1074f4747e26SJunchao Zhang const auto &Ai = aijkok->i_dual.view_device(); 1075f4747e26SJunchao Zhang const auto &Aj = aijkok->j_dual.view_device(); 1076f4747e26SJunchao Zhang if (ll) { 1077f4747e26SJunchao Zhang PetscCall(VecGetLocalSize(ll, &m)); 1078f4747e26SJunchao Zhang PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1079f4747e26SJunchao Zhang PetscCall(VecGetKokkosView(ll, &lv)); 1080f4747e26SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( // for each row 1081d326c3f1SJunchao Zhang Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1082f4747e26SJunchao Zhang PetscInt i = t.league_rank(); // row i 1083f4747e26SJunchao Zhang PetscInt len = Ai(i + 1) - Ai(i); 1084f4747e26SJunchao Zhang // scale entries on the row 1085f4747e26SJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1086f4747e26SJunchao Zhang })); 1087f4747e26SJunchao Zhang PetscCall(VecRestoreKokkosView(ll, &lv)); 1088f4747e26SJunchao Zhang PetscCall(PetscLogGpuFlops(nz)); 1089f4747e26SJunchao Zhang } 1090f4747e26SJunchao Zhang if (rr) { 1091f4747e26SJunchao Zhang PetscCall(VecGetLocalSize(rr, &n)); 1092f4747e26SJunchao Zhang PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1093f4747e26SJunchao Zhang PetscCall(VecGetKokkosView(rr, &rv)); 1094f4747e26SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1095d326c3f1SJunchao Zhang Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1096f4747e26SJunchao Zhang PetscCall(VecRestoreKokkosView(rr, &lv)); 1097f4747e26SJunchao Zhang PetscCall(PetscLogGpuFlops(nz)); 1098f4747e26SJunchao Zhang } 1099f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1100f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1101f4747e26SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 1102f4747e26SJunchao Zhang } 1103f4747e26SJunchao Zhang 1104d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1105d71ae5a4SJacob Faibussowitsch { 1106076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1107a587d139SMark 1108a587d139SMark PetscFunctionBegin; 1109076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 11102328674fSJunchao Zhang if (aijkok) { /* Only zero the device if data is already there */ 1111d326c3f1SJunchao Zhang KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 11129566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 11132328674fSJunchao Zhang } else { /* Might be preallocated but not assembled */ 11149566063dSJacob Faibussowitsch PetscCall(MatZeroEntries_SeqAIJ(A)); 11152328674fSJunchao Zhang } 11163ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1117a587d139SMark } 1118a587d139SMark 1119d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1120d71ae5a4SJacob Faibussowitsch { 1121f78ce678SMark Adams Mat_SeqAIJKokkos *aijkok; 1122f78ce678SMark Adams PetscInt n; 1123f78ce678SMark Adams PetscScalarKokkosView xv; 1124f78ce678SMark Adams 1125f78ce678SMark Adams PetscFunctionBegin; 1126f78ce678SMark Adams PetscCall(VecGetLocalSize(x, &n)); 1127f78ce678SMark Adams PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1128f78ce678SMark Adams PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1129f78ce678SMark Adams 1130f78ce678SMark Adams PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1131f78ce678SMark Adams aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1132f78ce678SMark Adams 1133f78ce678SMark Adams const auto &Aa = aijkok->a_dual.view_device(); 1134f78ce678SMark Adams const auto &Ai = aijkok->i_dual.view_device(); 1135f78ce678SMark Adams const auto &Adiag = aijkok->diag_dual.view_device(); 1136f78ce678SMark Adams 1137f78ce678SMark Adams PetscCall(VecGetKokkosViewWrite(x, &xv)); 11389371c9d4SSatish Balay Kokkos::parallel_for( 1139d326c3f1SJunchao Zhang Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1140f78ce678SMark Adams if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1141f78ce678SMark Adams else xv(i) = 0; 1142f78ce678SMark Adams }); 1143f78ce678SMark Adams PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 11443ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1145f78ce678SMark Adams } 1146f78ce678SMark Adams 1147db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1148d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1149d71ae5a4SJacob Faibussowitsch { 1150db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1151db78de30SJunchao Zhang 1152db78de30SJunchao Zhang PetscFunctionBegin; 1153db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 11544f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1155db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 11569566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1157db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1158076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 11593ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1160db78de30SJunchao Zhang } 1161db78de30SJunchao Zhang 1162d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1163d71ae5a4SJacob Faibussowitsch { 1164db78de30SJunchao Zhang PetscFunctionBegin; 1165db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 11664f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1167db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 11683ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1169db78de30SJunchao Zhang } 1170db78de30SJunchao Zhang 1171d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1172d71ae5a4SJacob Faibussowitsch { 1173db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1174db78de30SJunchao Zhang 1175db78de30SJunchao Zhang PetscFunctionBegin; 1176db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 11774f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1178db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 11799566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1180db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1181076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 11823ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1183db78de30SJunchao Zhang } 1184db78de30SJunchao Zhang 1185d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1186d71ae5a4SJacob Faibussowitsch { 1187db78de30SJunchao Zhang PetscFunctionBegin; 1188db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 11894f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1190db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 11919566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 11923ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1193db78de30SJunchao Zhang } 1194db78de30SJunchao Zhang 1195d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1196d71ae5a4SJacob Faibussowitsch { 1197db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1198db78de30SJunchao Zhang 1199db78de30SJunchao Zhang PetscFunctionBegin; 1200db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 12014f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1202db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1203db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1204076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 12053ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1206db78de30SJunchao Zhang } 1207db78de30SJunchao Zhang 1208d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1209d71ae5a4SJacob Faibussowitsch { 1210db78de30SJunchao Zhang PetscFunctionBegin; 1211db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 12124f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1213db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 12149566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 12153ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1216db78de30SJunchao Zhang } 1217db78de30SJunchao Zhang 1218*c0c276a7Ssdargavi PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A) 1219*c0c276a7Ssdargavi { 1220*c0c276a7Ssdargavi Mat_SeqAIJKokkos *akok; 1221*c0c276a7Ssdargavi 1222*c0c276a7Ssdargavi PetscFunctionBegin; 1223*c0c276a7Ssdargavi auto exec = PetscGetKokkosExecutionSpace(); 1224*c0c276a7Ssdargavi // Create host copies of the input aij 1225*c0c276a7Ssdargavi auto i_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), i_d); 1226*c0c276a7Ssdargavi auto j_h = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), j_d); 1227*c0c276a7Ssdargavi // Don't copy the vals to the host now 1228*c0c276a7Ssdargavi auto a_h = Kokkos::create_mirror_view(HostMirrorMemorySpace(), a_d); 1229*c0c276a7Ssdargavi 1230*c0c276a7Ssdargavi MatScalarKokkosDualView a_dual = MatScalarKokkosDualView(a_d, a_h); 1231*c0c276a7Ssdargavi // Note we have modified device data so it will copy lazily 1232*c0c276a7Ssdargavi a_dual.modify_device(); 1233*c0c276a7Ssdargavi MatRowMapKokkosDualView i_dual = MatRowMapKokkosDualView(i_d, i_h); 1234*c0c276a7Ssdargavi MatColIdxKokkosDualView j_dual = MatColIdxKokkosDualView(j_d, j_h); 1235*c0c276a7Ssdargavi 1236*c0c276a7Ssdargavi PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_dual.extent(0), i_dual, j_dual, a_dual)); 1237*c0c276a7Ssdargavi PetscCall(MatCreate(comm, A)); 1238*c0c276a7Ssdargavi PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 1239*c0c276a7Ssdargavi PetscFunctionReturn(PETSC_SUCCESS); 1240*c0c276a7Ssdargavi } 1241*c0c276a7Ssdargavi 1242c17cf699SJunchao Zhang /* Computes Y += alpha X */ 1243d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1244d71ae5a4SJacob Faibussowitsch { 1245a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1246c17cf699SJunchao Zhang Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1247c17cf699SJunchao Zhang ConstMatScalarKokkosView Xa; 1248c17cf699SJunchao Zhang MatScalarKokkosView Ya; 12494df4a32cSJunchao Zhang auto exec = PetscGetKokkosExecutionSpace(); 1250a587d139SMark 1251a587d139SMark PetscFunctionBegin; 1252c17cf699SJunchao Zhang PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1253c17cf699SJunchao Zhang PetscCheckTypeName(X, MATSEQAIJKOKKOS); 12549566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 12559566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(X)); 12569566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1257db78de30SJunchao Zhang 1258c17cf699SJunchao Zhang if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1259a587d139SMark PetscBool e; 12609566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1261a587d139SMark if (e) { 12629566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1263c17cf699SJunchao Zhang if (e) pattern = SAME_NONZERO_PATTERN; 1264a587d139SMark } 1265a587d139SMark } 1266db78de30SJunchao Zhang 1267c17cf699SJunchao Zhang /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1268c17cf699SJunchao Zhang cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1269c17cf699SJunchao Zhang If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1270c17cf699SJunchao Zhang KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1271c17cf699SJunchao Zhang */ 1272c17cf699SJunchao Zhang ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1273c17cf699SJunchao Zhang xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1274c17cf699SJunchao Zhang Xa = xkok->a_dual.view_device(); 1275c17cf699SJunchao Zhang Ya = ykok->a_dual.view_device(); 1276c17cf699SJunchao Zhang 1277c17cf699SJunchao Zhang if (pattern == SAME_NONZERO_PATTERN) { 1278d326c3f1SJunchao Zhang KokkosBlas::axpy(exec, alpha, Xa, Ya); 12799566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1280c17cf699SJunchao Zhang } else if (pattern == SUBSET_NONZERO_PATTERN) { 1281c17cf699SJunchao Zhang MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1282c17cf699SJunchao Zhang MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1283c17cf699SJunchao Zhang 12849371c9d4SSatish Balay Kokkos::parallel_for( 1285d326c3f1SJunchao Zhang Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 12860e3ece09SJunchao Zhang PetscInt i = t.league_rank(); // row i 12870e3ece09SJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { 12880e3ece09SJunchao Zhang // Only one thread works in a team 1289c17cf699SJunchao Zhang PetscInt p, q = Yi(i); 12900e3ece09SJunchao Zhang for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 12910e3ece09SJunchao Zhang while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 12920e3ece09SJunchao Zhang if (Xj(p) == Yj(q)) { // Found it 1293c17cf699SJunchao Zhang Ya(q) += alpha * Xa(p); 1294c17cf699SJunchao Zhang q++; 1295a587d139SMark } else { 12960e3ece09SJunchao Zhang // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 12970e3ece09SJunchao Zhang // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 12980e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 12990e3ece09SJunchao Zhang if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 13008b8b16f9SJunchao Zhang #else 13010e3ece09SJunchao Zhang if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 13028b8b16f9SJunchao Zhang #endif 1303a587d139SMark } 1304c17cf699SJunchao Zhang } 1305c17cf699SJunchao Zhang }); 1306c17cf699SJunchao Zhang }); 13079566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 13080e3ece09SJunchao Zhang } else { // different nonzero patterns 1309c17cf699SJunchao Zhang Mat Z; 1310c17cf699SJunchao Zhang KokkosCsrMatrix zcsr; 1311c17cf699SJunchao Zhang KernelHandle kh; 13120e3ece09SJunchao Zhang kh.create_spadd_handle(true); // X, Y are sorted 1313c17cf699SJunchao Zhang KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1314c17cf699SJunchao Zhang KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1315c17cf699SJunchao Zhang zkok = new Mat_SeqAIJKokkos(zcsr); 13169566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 13179566063dSJacob Faibussowitsch PetscCall(MatHeaderReplace(Y, &Z)); 1318c17cf699SJunchao Zhang kh.destroy_spadd_handle(); 1319c17cf699SJunchao Zhang } 13209566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 13210e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 13223ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1323a587d139SMark } 1324a587d139SMark 13252c4ab24aSJunchao Zhang struct MatCOOStruct_SeqAIJKokkos { 13262c4ab24aSJunchao Zhang PetscCount n; 13272c4ab24aSJunchao Zhang PetscCount Atot; 13282c4ab24aSJunchao Zhang PetscInt nz; 13292c4ab24aSJunchao Zhang PetscCountKokkosView jmap; 13302c4ab24aSJunchao Zhang PetscCountKokkosView perm; 13312c4ab24aSJunchao Zhang 13322c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 13332c4ab24aSJunchao Zhang { 13342c4ab24aSJunchao Zhang nz = coo_h->nz; 13352c4ab24aSJunchao Zhang n = coo_h->n; 13362c4ab24aSJunchao Zhang Atot = coo_h->Atot; 13372c4ab24aSJunchao Zhang jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 13382c4ab24aSJunchao Zhang perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 13392c4ab24aSJunchao Zhang } 13402c4ab24aSJunchao Zhang }; 13412c4ab24aSJunchao Zhang 134249abdd8aSBarry Smith static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data) 13432c4ab24aSJunchao Zhang { 13442c4ab24aSJunchao Zhang PetscFunctionBegin; 134549abdd8aSBarry Smith PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data)); 13462c4ab24aSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 13472c4ab24aSJunchao Zhang } 13482c4ab24aSJunchao Zhang 1349d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1350d71ae5a4SJacob Faibussowitsch { 135142550becSJunchao Zhang Mat_SeqAIJKokkos *akok; 135242550becSJunchao Zhang Mat_SeqAIJ *aseq; 135303e76207SPierre Jolivet PetscContainer container_h; 13542c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJ *coo_h; 13552c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJKokkos *coo_d; 135642550becSJunchao Zhang 135742550becSJunchao Zhang PetscFunctionBegin; 13589566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1359394ed5ebSJunchao Zhang aseq = static_cast<Mat_SeqAIJ *>(mat->data); 136042550becSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1361cbc6b225SStefano Zampini delete akok; 1362f4747e26SJunchao Zhang mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 13639566063dSJacob Faibussowitsch PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 13642c4ab24aSJunchao Zhang 13652c4ab24aSJunchao Zhang // Copy the COO struct to device 13662c4ab24aSJunchao Zhang PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 13672c4ab24aSJunchao Zhang PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 13682c4ab24aSJunchao Zhang PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 13692c4ab24aSJunchao Zhang 13702c4ab24aSJunchao Zhang // Put the COO struct in a container and then attach that to the matrix 137103e76207SPierre Jolivet PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos)); 13723ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 137342550becSJunchao Zhang } 137442550becSJunchao Zhang 1375d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1376d71ae5a4SJacob Faibussowitsch { 137742550becSJunchao Zhang MatScalarKokkosView Aa; 137842550becSJunchao Zhang ConstMatScalarKokkosView kv; 137942550becSJunchao Zhang PetscMemType memtype; 13802c4ab24aSJunchao Zhang PetscContainer container; 13812c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJKokkos *coo; 138242550becSJunchao Zhang 138342550becSJunchao Zhang PetscFunctionBegin; 13842c4ab24aSJunchao Zhang PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 13852c4ab24aSJunchao Zhang PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 13862c4ab24aSJunchao Zhang 13872c4ab24aSJunchao Zhang const auto &n = coo->n; 13882c4ab24aSJunchao Zhang const auto &Annz = coo->nz; 13892c4ab24aSJunchao Zhang const auto &jmap = coo->jmap; 13902c4ab24aSJunchao Zhang const auto &perm = coo->perm; 13912c4ab24aSJunchao Zhang 13929566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(v, &memtype)); 139342550becSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 13942c4ab24aSJunchao Zhang kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 139542550becSJunchao Zhang } else { 13962c4ab24aSJunchao Zhang kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 139742550becSJunchao Zhang } 139842550becSJunchao Zhang 1399c7b718f4SJunchao Zhang if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1400c7b718f4SJunchao Zhang else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 140142550becSJunchao Zhang 140208bb9926SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 14039371c9d4SSatish Balay Kokkos::parallel_for( 1404d326c3f1SJunchao Zhang Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1405c7b718f4SJunchao Zhang PetscScalar sum = 0.0; 1406c7b718f4SJunchao Zhang for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1407c7b718f4SJunchao Zhang Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1408c7b718f4SJunchao Zhang }); 140908bb9926SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1410394ed5ebSJunchao Zhang 14119566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 14129566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 14133ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 141442550becSJunchao Zhang } 141542550becSJunchao Zhang 1416d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1417d71ae5a4SJacob Faibussowitsch { 14188f7e8f9dSMark Adams PetscFunctionBegin; 14199566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncHost(A)); 14209566063dSJacob Faibussowitsch PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 14218f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 14223ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 14238f7e8f9dSMark Adams } 14248f7e8f9dSMark Adams 1425d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1426d71ae5a4SJacob Faibussowitsch { 1427076ba34aSJunchao Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1428076ba34aSJunchao Zhang 14298c3ff71bSJunchao Zhang PetscFunctionBegin; 1430076ba34aSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 14316f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 14326f3d89d0SStefano Zampini 14338c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 14348c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 14358c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1436a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 1437f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 1438a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1439076ba34aSJunchao Zhang A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 14408c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 14418c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 14428c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 14438c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 14448c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 14458c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1446076ba34aSJunchao Zhang A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 14470ecb592aSJunchao Zhang A->ops->transpose = MatTranspose_SeqAIJKokkos; 1448152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 1449f78ce678SMark Adams A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1450f4747e26SJunchao Zhang A->ops->shift = MatShift_SeqAIJKokkos; 1451f4747e26SJunchao Zhang A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1452f4747e26SJunchao Zhang A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1453076ba34aSJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1454076ba34aSJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1455076ba34aSJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1456076ba34aSJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1457076ba34aSJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1458076ba34aSJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 14597ee59b9bSJunchao Zhang a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 146042550becSJunchao Zhang 14619566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 14629566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 146357761e9aSJunchao Zhang #if defined(PETSC_HAVE_HYPRE) 146457761e9aSJunchao Zhang PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE)); 146557761e9aSJunchao Zhang #endif 14663ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1467076ba34aSJunchao Zhang } 1468076ba34aSJunchao Zhang 14699d13fa56SJunchao Zhang /* 14709d13fa56SJunchao Zhang Extract the (prescribled) diagonal blocks of the matrix and then invert them 14719d13fa56SJunchao Zhang 14729d13fa56SJunchao Zhang Input Parameters: 14739d13fa56SJunchao Zhang + A - the MATSEQAIJKOKKOS matrix 14749d13fa56SJunchao Zhang . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 14759d13fa56SJunchao Zhang . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 14769d13fa56SJunchao Zhang . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 14779d13fa56SJunchao Zhang - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 14789d13fa56SJunchao Zhang 14799d13fa56SJunchao Zhang Output Parameter: 14809d13fa56SJunchao Zhang . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 14819d13fa56SJunchao Zhang */ 14829d13fa56SJunchao Zhang PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 14839d13fa56SJunchao Zhang { 14849d13fa56SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 14859d13fa56SJunchao Zhang PetscInt nblocks = bs.extent(0) - 1; 14869d13fa56SJunchao Zhang 14879d13fa56SJunchao Zhang PetscFunctionBegin; 14889d13fa56SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 14899d13fa56SJunchao Zhang 14909d13fa56SJunchao Zhang // Pull out the diagonal blocks of the matrix and then invert the blocks 14919d13fa56SJunchao Zhang auto Aa = akok->a_dual.view_device(); 14929d13fa56SJunchao Zhang auto Ai = akok->i_dual.view_device(); 14939d13fa56SJunchao Zhang auto Aj = akok->j_dual.view_device(); 14949d13fa56SJunchao Zhang auto Adiag = akok->diag_dual.view_device(); 14959d13fa56SJunchao Zhang // TODO: how to tune the team size? 149645402d8aSJunchao Zhang #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY) 14979d13fa56SJunchao Zhang auto ts = Kokkos::AUTO(); 14989d13fa56SJunchao Zhang #else 14999d13fa56SJunchao Zhang auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 15009d13fa56SJunchao Zhang #endif 15019d13fa56SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( 1502d326c3f1SJunchao Zhang Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 15039d13fa56SJunchao Zhang const PetscInt bid = teamMember.league_rank(); // block id 15049d13fa56SJunchao Zhang const PetscInt rstart = bs(bid); // this block starts from this row 15059d13fa56SJunchao Zhang const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 15069d13fa56SJunchao Zhang const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 15079d13fa56SJunchao Zhang const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 15089d13fa56SJunchao Zhang 15099d13fa56SJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 15109d13fa56SJunchao Zhang PetscInt i = rstart + r; // i-th row in A 15119d13fa56SJunchao Zhang 15129d13fa56SJunchao Zhang if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 15139d13fa56SJunchao Zhang PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 15149d13fa56SJunchao Zhang 15159d13fa56SJunchao Zhang for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 15169d13fa56SJunchao Zhang if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 15179d13fa56SJunchao Zhang B(r, c) = 0.0; 15189d13fa56SJunchao Zhang } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 15199d13fa56SJunchao Zhang B(r, c) = Aa(first + c); 15209d13fa56SJunchao Zhang } else { // this entry does not show up in the CSR 15219d13fa56SJunchao Zhang B(r, c) = 0.0; 15229d13fa56SJunchao Zhang } 15239d13fa56SJunchao Zhang } 15249d13fa56SJunchao Zhang } else { // rare case that the diagonal does not exist 15259d13fa56SJunchao Zhang const PetscInt begin = Ai(i); 15269d13fa56SJunchao Zhang const PetscInt end = Ai(i + 1); 15279d13fa56SJunchao Zhang for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 15289d13fa56SJunchao Zhang for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 15299d13fa56SJunchao Zhang if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 15309d13fa56SJunchao Zhang else if (Aj(j) >= rstart + m) break; 15319d13fa56SJunchao Zhang } 15329d13fa56SJunchao Zhang } 15339d13fa56SJunchao Zhang }); 15349d13fa56SJunchao Zhang 15359d13fa56SJunchao Zhang // LU-decompose B (w/o pivoting) and then invert B 15369d13fa56SJunchao Zhang KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 15379d13fa56SJunchao Zhang KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 15389d13fa56SJunchao Zhang })); 15399d13fa56SJunchao Zhang // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 15409d13fa56SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 15419d13fa56SJunchao Zhang } 15429d13fa56SJunchao Zhang 1543d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1544d71ae5a4SJacob Faibussowitsch { 1545076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 1546076ba34aSJunchao Zhang PetscInt i, m, n; 15474df4a32cSJunchao Zhang auto exec = PetscGetKokkosExecutionSpace(); 1548076ba34aSJunchao Zhang 1549076ba34aSJunchao Zhang PetscFunctionBegin; 15505f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1551076ba34aSJunchao Zhang 1552076ba34aSJunchao Zhang m = akok->nrows(); 1553076ba34aSJunchao Zhang n = akok->ncols(); 15549566063dSJacob Faibussowitsch PetscCall(MatSetSizes(A, m, n, m, n)); 15559566063dSJacob Faibussowitsch PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1556076ba34aSJunchao Zhang 1557076ba34aSJunchao Zhang /* Set up data structures of A as a MATSEQAIJ */ 15589566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 155957508eceSPierre Jolivet aseq = (Mat_SeqAIJ *)A->data; 1560076ba34aSJunchao Zhang 1561e36ced11SJunchao Zhang PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */ 1562e36ced11SJunchao Zhang PetscCallCXX(akok->j_dual.sync_host(exec)); 1563e36ced11SJunchao Zhang PetscCallCXX(exec.fence()); 1564076ba34aSJunchao Zhang 1565076ba34aSJunchao Zhang aseq->i = akok->i_host_data(); 1566076ba34aSJunchao Zhang aseq->j = akok->j_host_data(); 1567076ba34aSJunchao Zhang aseq->a = akok->a_host_data(); 1568076ba34aSJunchao Zhang aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1569076ba34aSJunchao Zhang aseq->free_a = PETSC_FALSE; 1570076ba34aSJunchao Zhang aseq->free_ij = PETSC_FALSE; 1571076ba34aSJunchao Zhang aseq->nz = akok->nnz(); 1572076ba34aSJunchao Zhang aseq->maxnz = aseq->nz; 1573076ba34aSJunchao Zhang 15749566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aseq->imax)); 15759566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aseq->ilen)); 1576ad540459SPierre Jolivet for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1577076ba34aSJunchao Zhang 1578076ba34aSJunchao Zhang /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1579076ba34aSJunchao Zhang akok->nonzerostate = A->nonzerostate; 1580ff751488SJunchao Zhang A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 15819566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 15829566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 15833ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1584076ba34aSJunchao Zhang } 1585076ba34aSJunchao Zhang 15860e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 15870e3ece09SJunchao Zhang { 15880e3ece09SJunchao Zhang PetscFunctionBegin; 15890e3ece09SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(A)); 15900e3ece09SJunchao Zhang *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 15910e3ece09SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 15920e3ece09SJunchao Zhang } 15930e3ece09SJunchao Zhang 15940e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 15950e3ece09SJunchao Zhang { 15960e3ece09SJunchao Zhang Mat_SeqAIJKokkos *akok; 15974d86920dSPierre Jolivet 15980e3ece09SJunchao Zhang PetscFunctionBegin; 15990e3ece09SJunchao Zhang PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 16000e3ece09SJunchao Zhang PetscCall(MatCreate(comm, A)); 16010e3ece09SJunchao Zhang PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 16020e3ece09SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 16030e3ece09SJunchao Zhang } 16040e3ece09SJunchao Zhang 1605076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1606076ba34aSJunchao Zhang 1607076ba34aSJunchao Zhang Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1608076ba34aSJunchao Zhang */ 1609d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1610d71ae5a4SJacob Faibussowitsch { 1611076ba34aSJunchao Zhang PetscFunctionBegin; 16129566063dSJacob Faibussowitsch PetscCall(MatCreate(comm, A)); 16139566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 16143ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 16158c3ff71bSJunchao Zhang } 16168c3ff71bSJunchao Zhang 1617152b3e56SJunchao Zhang /*@C 161811a5261eSBarry Smith MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 16198c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 162020f4b53cSBarry Smith Kokkos for calculations. 16218c3ff71bSJunchao Zhang 16228c3ff71bSJunchao Zhang Collective 16238c3ff71bSJunchao Zhang 16248c3ff71bSJunchao Zhang Input Parameters: 162511a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF` 16268c3ff71bSJunchao Zhang . m - number of rows 16278c3ff71bSJunchao Zhang . n - number of columns 162820f4b53cSBarry Smith . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 162920f4b53cSBarry Smith - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 16308c3ff71bSJunchao Zhang 16318c3ff71bSJunchao Zhang Output Parameter: 16328c3ff71bSJunchao Zhang . A - the matrix 16338c3ff71bSJunchao Zhang 16342ef1f0ffSBarry Smith Level: intermediate 16352ef1f0ffSBarry Smith 16362ef1f0ffSBarry Smith Notes: 163711a5261eSBarry Smith It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 16388c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 163911a5261eSBarry Smith [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 16408c3ff71bSJunchao Zhang 164111a5261eSBarry Smith The AIJ format, also called 16422ef1f0ffSBarry Smith compressed row storage, is fully compatible with standard Fortran 16438c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 164420f4b53cSBarry Smith either one (as in Fortran) or zero. 16458c3ff71bSJunchao Zhang 16462ef1f0ffSBarry Smith Specify the preallocated storage with either `nz` or `nnz` (not both). 16472ef1f0ffSBarry Smith Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 16482ef1f0ffSBarry Smith allocation. 16498c3ff71bSJunchao Zhang 1650fe59aa6dSJacob Faibussowitsch .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 16518c3ff71bSJunchao Zhang @*/ 1652d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1653d71ae5a4SJacob Faibussowitsch { 16548c3ff71bSJunchao Zhang PetscFunctionBegin; 16559566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 16569566063dSJacob Faibussowitsch PetscCall(MatCreate(comm, A)); 16579566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A, m, n, m, n)); 16589566063dSJacob Faibussowitsch PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 16599566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 16603ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 16618c3ff71bSJunchao Zhang } 1662930e68a5SMark Adams 1663d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1664d71ae5a4SJacob Faibussowitsch { 1665930e68a5SMark Adams PetscFunctionBegin; 16669566063dSJacob Faibussowitsch PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 166786a27549SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 16683ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 166986a27549SJunchao Zhang } 167086a27549SJunchao Zhang 1671d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1672d71ae5a4SJacob Faibussowitsch { 167386a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 167486a27549SJunchao Zhang 167586a27549SJunchao Zhang PetscFunctionBegin; 167686a27549SJunchao Zhang if (!factors->sptrsv_symbolic_completed) { 167786a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 167886a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 167986a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_TRUE; 168086a27549SJunchao Zhang } 16813ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 168286a27549SJunchao Zhang } 168386a27549SJunchao Zhang 168486a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */ 1685d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1686d71ae5a4SJacob Faibussowitsch { 168786a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1688076ba34aSJunchao Zhang MatColIdxType n = A->rmap->n; 168986a27549SJunchao Zhang 169086a27549SJunchao Zhang PetscFunctionBegin; 169186a27549SJunchao Zhang if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 169286a27549SJunchao Zhang /* Update L^T and do sptrsv symbolic */ 16937b8d4ba6SJunchao Zhang factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0 16947b8d4ba6SJunchao Zhang factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 16957b8d4ba6SJunchao Zhang factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 169686a27549SJunchao Zhang 16979371c9d4SSatish Balay transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 169886a27549SJunchao Zhang factors->iLt_d, factors->jLt_d, factors->aLt_d); 169986a27549SJunchao Zhang 170086a27549SJunchao Zhang /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 170186a27549SJunchao Zhang We have to sort the indices, until KK provides finer control options. 170286a27549SJunchao Zhang */ 17039371c9d4SSatish Balay sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 170486a27549SJunchao Zhang 170586a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 170686a27549SJunchao Zhang 170786a27549SJunchao Zhang /* Update U^T and do sptrsv symbolic */ 17087b8d4ba6SJunchao Zhang factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0 17097b8d4ba6SJunchao Zhang factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 17107b8d4ba6SJunchao Zhang factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 171186a27549SJunchao Zhang 17129371c9d4SSatish Balay transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 171386a27549SJunchao Zhang factors->iUt_d, factors->jUt_d, factors->aUt_d); 171486a27549SJunchao Zhang 171586a27549SJunchao Zhang /* Sort indices. See comments above */ 17169371c9d4SSatish Balay sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 171786a27549SJunchao Zhang 171886a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 171986a27549SJunchao Zhang factors->transpose_updated = PETSC_TRUE; 172086a27549SJunchao Zhang } 17213ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 172286a27549SJunchao Zhang } 172386a27549SJunchao Zhang 172486a27549SJunchao Zhang /* Solve Ax = b, with A = LU */ 1725d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1726d71ae5a4SJacob Faibussowitsch { 172786a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 172886a27549SJunchao Zhang PetscScalarKokkosView xv; 172986a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 173086a27549SJunchao Zhang 173186a27549SJunchao Zhang PetscFunctionBegin; 17329566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 17339566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 17349566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(b, &bv)); 17359566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(x, &xv)); 173686a27549SJunchao Zhang /* Solve L tmpv = b */ 17379566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 173886a27549SJunchao Zhang /* Solve Ux = tmpv */ 17399566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 17409566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(b, &bv)); 17419566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 17429566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 17433ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 174486a27549SJunchao Zhang } 174586a27549SJunchao Zhang 1746076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */ 1747d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1748d71ae5a4SJacob Faibussowitsch { 174986a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 175086a27549SJunchao Zhang PetscScalarKokkosView xv; 175186a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 175286a27549SJunchao Zhang 175386a27549SJunchao Zhang PetscFunctionBegin; 17549566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 17559566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 17569566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(b, &bv)); 17579566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(x, &xv)); 175886a27549SJunchao Zhang /* Solve U^T tmpv = b */ 175986a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 176086a27549SJunchao Zhang 176186a27549SJunchao Zhang /* Solve L^T x = tmpv */ 176286a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 17639566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(b, &bv)); 17649566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 17659566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 17663ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 176786a27549SJunchao Zhang } 176886a27549SJunchao Zhang 1769d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1770d71ae5a4SJacob Faibussowitsch { 177186a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 177286a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 177386a27549SJunchao Zhang PetscInt fill_lev = info->levels; 177486a27549SJunchao Zhang 177586a27549SJunchao Zhang PetscFunctionBegin; 17769566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 17779566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1778076ba34aSJunchao Zhang 1779076ba34aSJunchao Zhang auto a_d = aijkok->a_dual.view_device(); 1780076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1781076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1782076ba34aSJunchao Zhang 1783076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 178486a27549SJunchao Zhang 178586a27549SJunchao Zhang B->assembled = PETSC_TRUE; 178686a27549SJunchao Zhang B->preallocated = PETSC_TRUE; 178786a27549SJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos; 178886a27549SJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 178986a27549SJunchao Zhang B->ops->matsolve = NULL; 179086a27549SJunchao Zhang B->ops->matsolvetranspose = NULL; 179186a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 179286a27549SJunchao Zhang 179386a27549SJunchao Zhang /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 179486a27549SJunchao Zhang factors->transpose_updated = PETSC_FALSE; 179586a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 1796eeadb341SJunchao Zhang /* TODO: log flops, but how to know that? */ 17979566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 17983ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 179986a27549SJunchao Zhang } 180086a27549SJunchao Zhang 1801d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1802d71ae5a4SJacob Faibussowitsch { 180386a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 180486a27549SJunchao Zhang Mat_SeqAIJ *b; 180586a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 180686a27549SJunchao Zhang PetscInt fill_lev = info->levels; 180786a27549SJunchao Zhang PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 180886a27549SJunchao Zhang PetscInt n = A->rmap->n; 180986a27549SJunchao Zhang 181086a27549SJunchao Zhang PetscFunctionBegin; 18119566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 181286a27549SJunchao Zhang /* Rebuild factors */ 18139371c9d4SSatish Balay if (factors) { 18149371c9d4SSatish Balay factors->Destroy(); 18159371c9d4SSatish Balay } /* Destroy the old if it exists */ 18169371c9d4SSatish Balay else { 18179371c9d4SSatish Balay B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 18189371c9d4SSatish Balay } 181986a27549SJunchao Zhang 182086a27549SJunchao Zhang /* Create a spiluk handle and then do symbolic factorization */ 182186a27549SJunchao Zhang nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 182286a27549SJunchao Zhang factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 182386a27549SJunchao Zhang 182486a27549SJunchao Zhang auto spiluk_handle = factors->kh.get_spiluk_handle(); 182586a27549SJunchao Zhang 182686a27549SJunchao Zhang Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 182786a27549SJunchao Zhang Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 182886a27549SJunchao Zhang Kokkos::realloc(factors->iU_d, n + 1); 182986a27549SJunchao Zhang Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 183086a27549SJunchao Zhang 183186a27549SJunchao Zhang aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1832076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1833076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1834076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 183586a27549SJunchao Zhang /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 183686a27549SJunchao Zhang 183786a27549SJunchao Zhang Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 183886a27549SJunchao Zhang Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 183986a27549SJunchao Zhang Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 184086a27549SJunchao Zhang Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 184186a27549SJunchao Zhang 184286a27549SJunchao Zhang /* TODO: add options to select sptrsv algorithms */ 184386a27549SJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 184486a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 184586a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 184686a27549SJunchao Zhang #else 184786a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 184886a27549SJunchao Zhang #endif 184986a27549SJunchao Zhang 185086a27549SJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 185186a27549SJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 185286a27549SJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 185386a27549SJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 185486a27549SJunchao Zhang 185586a27549SJunchao Zhang /* Fill fields of the factor matrix B */ 18569566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 185786a27549SJunchao Zhang b = (Mat_SeqAIJ *)B->data; 185886a27549SJunchao Zhang b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 185986a27549SJunchao Zhang B->info.fill_ratio_given = info->fill; 1860a1e4e190SStefano Zampini B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 186186a27549SJunchao Zhang 186286a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 186386a27549SJunchao Zhang B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 18643ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1865930e68a5SMark Adams } 1866930e68a5SMark Adams 1867d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1868d71ae5a4SJacob Faibussowitsch { 1869930e68a5SMark Adams PetscFunctionBegin; 1870930e68a5SMark Adams *type = MATSOLVERKOKKOS; 18713ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1872930e68a5SMark Adams } 1873930e68a5SMark Adams 1874930e68a5SMark Adams /*MC 187586a27549SJunchao Zhang MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 187611a5261eSBarry Smith on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1877930e68a5SMark Adams 1878930e68a5SMark Adams Level: beginner 1879930e68a5SMark Adams 18801cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1881930e68a5SMark Adams M*/ 188286a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1883930e68a5SMark Adams { 1884930e68a5SMark Adams PetscInt n = A->rmap->n; 1885930e68a5SMark Adams 1886930e68a5SMark Adams PetscFunctionBegin; 18879566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 18889566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*B, n, n, n, n)); 1889930e68a5SMark Adams (*B)->factortype = ftype; 18909566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 18919566063dSJacob Faibussowitsch PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1892930e68a5SMark Adams 18938f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 18949566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 189586a27549SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 189686a27549SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 189786a27549SJunchao Zhang } else if (ftype == MAT_FACTOR_ILU) { 18989566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 189986a27549SJunchao Zhang (*B)->canuseordering = PETSC_FALSE; 190086a27549SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 190198921bdaSJacob Faibussowitsch } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1902930e68a5SMark Adams 19039566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1904f4f49eeaSPierre Jolivet PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 19053ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1906930e68a5SMark Adams } 19078f7e8f9dSMark Adams 1908d1f0640dSPierre Jolivet PETSC_INTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1909d71ae5a4SJacob Faibussowitsch { 191086a27549SJunchao Zhang PetscFunctionBegin; 19119566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 19129566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 19133ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 191486a27549SJunchao Zhang } 191586a27549SJunchao Zhang 1916076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */ 1917d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1918d71ae5a4SJacob Faibussowitsch { 191945402d8aSJunchao Zhang const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map); 192045402d8aSJunchao Zhang const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries); 192145402d8aSJunchao Zhang const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values); 1922076ba34aSJunchao Zhang const PetscInt *i = iv.data(); 1923076ba34aSJunchao Zhang const PetscInt *j = jv.data(); 1924076ba34aSJunchao Zhang const PetscScalar *a = av.data(); 1925076ba34aSJunchao Zhang PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 1926076ba34aSJunchao Zhang 1927076ba34aSJunchao Zhang PetscFunctionBegin; 19289566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 1929076ba34aSJunchao Zhang for (PetscInt k = 0; k < m; k++) { 19309566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 193148a46eb9SPierre Jolivet for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 19329566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 1933076ba34aSJunchao Zhang } 19343ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1935076ba34aSJunchao Zhang } 1936