111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 2076ba34aSJunchao Zhang #include <petscpkg_version.h> 3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 442550becSJunchao Zhang #include <petsc/private/sfimpl.h> 58c3ff71bSJunchao Zhang #include <petscsystypes.h> 68c3ff71bSJunchao Zhang #include <petscerror.h> 78c3ff71bSJunchao Zhang 88c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 9f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 108c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 118c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 1286a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp> 1386a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp> 14076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp> 15076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp> 1686a27549SJunchao Zhang 1742550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 188c3ff71bSJunchao Zhang 190e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 20f98996d3SJunchao Zhang #include <KokkosSparse_Utils.hpp> 21f98996d3SJunchao Zhang using KokkosSparse::sort_crs_matrix; 229371c9d4SSatish Balay using KokkosSparse::Impl::transpose_matrix; 23f98996d3SJunchao Zhang #else 24f98996d3SJunchao Zhang #include <KokkosKernels_Sorting.hpp> 25f98996d3SJunchao Zhang using KokkosKernels::sort_crs_matrix; 269371c9d4SSatish Balay using KokkosKernels::Impl::transpose_matrix; 27f98996d3SJunchao Zhang #endif 28f98996d3SJunchao Zhang 298c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 308c3ff71bSJunchao Zhang 31076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 32076ba34aSJunchao Zhang we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 33076ba34aSJunchao Zhang In the latter case, it is important to set a_dual's sync state correctly. 34076ba34aSJunchao Zhang */ 35d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 36d71ae5a4SJacob Faibussowitsch { 37076ba34aSJunchao Zhang Mat_SeqAIJ *aijseq; 38076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 398c3ff71bSJunchao Zhang 408c3ff71bSJunchao Zhang PetscFunctionBegin; 413ba16761SJacob Faibussowitsch if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 429566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 43076ba34aSJunchao Zhang 44076ba34aSJunchao Zhang aijseq = static_cast<Mat_SeqAIJ *>(A->data); 45076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 46076ba34aSJunchao Zhang 47076ba34aSJunchao Zhang /* If aijkok does not exist, we just copy i, j to device. 48076ba34aSJunchao Zhang If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 49076ba34aSJunchao Zhang In both cases, we build a new aijkok structure. 50076ba34aSJunchao Zhang */ 51076ba34aSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 52076ba34aSJunchao Zhang delete aijkok; 53076ba34aSJunchao Zhang aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 54076ba34aSJunchao Zhang A->spptr = aijkok; 55076ba34aSJunchao Zhang } 56076ba34aSJunchao Zhang 573ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 588c3ff71bSJunchao Zhang } 598c3ff71bSJunchao Zhang 6086a27549SJunchao Zhang /* Sync CSR data to device if not yet */ 61d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 62d71ae5a4SJacob Faibussowitsch { 638c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 648c3ff71bSJunchao Zhang 658c3ff71bSJunchao Zhang PetscFunctionBegin; 66aaa8cc7dSPierre Jolivet PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 675f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 68076ba34aSJunchao Zhang if (aijkok->a_dual.need_sync_device()) { 69076ba34aSJunchao Zhang aijkok->a_dual.sync_device(); 70580c7c76SPierre Jolivet aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 7186a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 728c3ff71bSJunchao Zhang } 733ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 748c3ff71bSJunchao Zhang } 758c3ff71bSJunchao Zhang 76076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */ 77d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 78d71ae5a4SJacob Faibussowitsch { 7986a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 8086a27549SJunchao Zhang 8186a27549SJunchao Zhang PetscFunctionBegin; 825f80ce2aSJacob Faibussowitsch PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 8386a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 8486a27549SJunchao Zhang aijkok->a_dual.modify_device(); 8586a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 8686a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 879566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 889566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 893ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 9086a27549SJunchao Zhang } 9186a27549SJunchao Zhang 92d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 93d71ae5a4SJacob Faibussowitsch { 94f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 95f0cf5187SStefano Zampini 96f0cf5187SStefano Zampini PetscFunctionBegin; 97f0cf5187SStefano Zampini PetscCheckTypeName(A, MATSEQAIJKOKKOS); 9886a27549SJunchao Zhang /* We do not expect one needs factors on host */ 99aaa8cc7dSPierre Jolivet PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 1005f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 101076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 1023ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 103f0cf5187SStefano Zampini } 104f0cf5187SStefano Zampini 105d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 106d71ae5a4SJacob Faibussowitsch { 107076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 108f0cf5187SStefano Zampini 109f0cf5187SStefano Zampini PetscFunctionBegin; 1105519a089SJose E. Roman /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 1115519a089SJose E. Roman Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 1125519a089SJose E. Roman reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 1135519a089SJose E. Roman must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 1145519a089SJose E. Roman */ 1155519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 116076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 117076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 118076ba34aSJunchao Zhang } else { /* Happens when calling MatSetValues on a newly created matrix */ 119076ba34aSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 120076ba34aSJunchao Zhang } 1213ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 122076ba34aSJunchao Zhang } 123076ba34aSJunchao Zhang 124d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 125d71ae5a4SJacob Faibussowitsch { 126076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 127076ba34aSJunchao Zhang 128076ba34aSJunchao Zhang PetscFunctionBegin; 1295519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 1303ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 131076ba34aSJunchao Zhang } 132076ba34aSJunchao Zhang 133d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 134d71ae5a4SJacob Faibussowitsch { 135076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 136076ba34aSJunchao Zhang 137076ba34aSJunchao Zhang PetscFunctionBegin; 1385519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 139076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 140076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1412328674fSJunchao Zhang } else { 1422328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 1432328674fSJunchao Zhang } 1443ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 145076ba34aSJunchao Zhang } 146076ba34aSJunchao Zhang 147d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 148d71ae5a4SJacob Faibussowitsch { 149076ba34aSJunchao Zhang PetscFunctionBegin; 150076ba34aSJunchao Zhang *array = NULL; 1513ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 152076ba34aSJunchao Zhang } 153076ba34aSJunchao Zhang 154d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 155d71ae5a4SJacob Faibussowitsch { 156076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 157076ba34aSJunchao Zhang 158076ba34aSJunchao Zhang PetscFunctionBegin; 1595519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 160076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1612328674fSJunchao Zhang } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 1622328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 1632328674fSJunchao Zhang } 1643ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 165076ba34aSJunchao Zhang } 166076ba34aSJunchao Zhang 167d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 168d71ae5a4SJacob Faibussowitsch { 169076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 170076ba34aSJunchao Zhang 171076ba34aSJunchao Zhang PetscFunctionBegin; 1725519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 173076ba34aSJunchao Zhang aijkok->a_dual.clear_sync_state(); 174076ba34aSJunchao Zhang aijkok->a_dual.modify_host(); 1752328674fSJunchao Zhang } 1763ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 177f0cf5187SStefano Zampini } 178f0cf5187SStefano Zampini 179d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 180d71ae5a4SJacob Faibussowitsch { 1817ee59b9bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1827ee59b9bSJunchao Zhang 1837ee59b9bSJunchao Zhang PetscFunctionBegin; 1847ee59b9bSJunchao Zhang PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 1857ee59b9bSJunchao Zhang 1867ee59b9bSJunchao Zhang if (i) *i = aijkok->i_device_data(); 1877ee59b9bSJunchao Zhang if (j) *j = aijkok->j_device_data(); 1887ee59b9bSJunchao Zhang if (a) { 1897ee59b9bSJunchao Zhang aijkok->a_dual.sync_device(); 1907ee59b9bSJunchao Zhang *a = aijkok->a_device_data(); 1917ee59b9bSJunchao Zhang } 1927ee59b9bSJunchao Zhang if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 1933ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1947ee59b9bSJunchao Zhang } 1957ee59b9bSJunchao Zhang 1960e3ece09SJunchao Zhang /* 1970e3ece09SJunchao Zhang Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 1980e3ece09SJunchao Zhang 1990e3ece09SJunchao Zhang Input Parameter: 2000e3ece09SJunchao Zhang . A - the MATSEQAIJKOKKOS matrix 2010e3ece09SJunchao Zhang 2020e3ece09SJunchao Zhang Output Parameters: 2030e3ece09SJunchao Zhang + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 204aaa8cc7dSPierre Jolivet - T_d - the transpose on device, whose value array is allocated but not initialized 2050e3ece09SJunchao Zhang */ 2060e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 207d71ae5a4SJacob Faibussowitsch { 2080e3ece09SJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 2090e3ece09SJunchao Zhang PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 2100e3ece09SJunchao Zhang const PetscInt *Ai = aseq->i, *Aj = aseq->j; 211*7b8d4ba6SJunchao Zhang MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 2120e3ece09SJunchao Zhang MatRowMapType *Ti = Ti_h.data(); 213*7b8d4ba6SJunchao Zhang MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 214*7b8d4ba6SJunchao Zhang MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 2150e3ece09SJunchao Zhang PetscInt *Tj = Tj_h.data(); 2160e3ece09SJunchao Zhang PetscInt *perm = perm_h.data(); 2170e3ece09SJunchao Zhang PetscInt *offset; 218152b3e56SJunchao Zhang 219152b3e56SJunchao Zhang PetscFunctionBegin; 2200e3ece09SJunchao Zhang // Populate Ti 2210e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 2220e3ece09SJunchao Zhang Ti++; 2230e3ece09SJunchao Zhang for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 2240e3ece09SJunchao Zhang Ti--; 2250e3ece09SJunchao Zhang for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 2260e3ece09SJunchao Zhang 2270e3ece09SJunchao Zhang // Populate Tj and the permutation array 2280e3ece09SJunchao Zhang PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 2290e3ece09SJunchao Zhang for (PetscInt i = 0; i < m; i++) { 2300e3ece09SJunchao Zhang for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 2310e3ece09SJunchao Zhang PetscInt r = Aj[j]; // row r of T 2320e3ece09SJunchao Zhang PetscInt disp = Ti[r] + offset[r]; 2330e3ece09SJunchao Zhang 2340e3ece09SJunchao Zhang Tj[disp] = i; // col i of T 2350e3ece09SJunchao Zhang perm[disp] = j; 2360e3ece09SJunchao Zhang offset[r]++; 237076ba34aSJunchao Zhang } 2380e3ece09SJunchao Zhang } 2390e3ece09SJunchao Zhang PetscCall(PetscFree(offset)); 2400e3ece09SJunchao Zhang 2410e3ece09SJunchao Zhang // Sort each row of T, along with the permutation array 2420e3ece09SJunchao Zhang for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 2430e3ece09SJunchao Zhang 2440e3ece09SJunchao Zhang // Output perm and T on device 2450e3ece09SJunchao Zhang auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 2460e3ece09SJunchao Zhang auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 2470e3ece09SJunchao Zhang PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 2480e3ece09SJunchao Zhang PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 2493ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 250152b3e56SJunchao Zhang } 251152b3e56SJunchao Zhang 2520e3ece09SJunchao Zhang // Generate the transpose on device and cache it internally 2530e3ece09SJunchao Zhang // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 2540e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 255d71ae5a4SJacob Faibussowitsch { 2560e3ece09SJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 2570e3ece09SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 2580e3ece09SJunchao Zhang PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 2590e3ece09SJunchao Zhang KokkosCsrMatrix &T = akok->csrmatT; 260152b3e56SJunchao Zhang 261152b3e56SJunchao Zhang PetscFunctionBegin; 2620e3ece09SJunchao Zhang PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 2630e3ece09SJunchao Zhang PetscCallCXX(akok->a_dual.sync_device()); // Sync A's valeus since we are going to access them on device 2640e3ece09SJunchao Zhang 2650e3ece09SJunchao Zhang const auto &Aa = akok->a_dual.view_device(); 2660e3ece09SJunchao Zhang 2670e3ece09SJunchao Zhang if (A->symmetric == PETSC_BOOL3_TRUE) { 2680e3ece09SJunchao Zhang *csrmatT = akok->csrmat; 2690e3ece09SJunchao Zhang } else { 2700e3ece09SJunchao Zhang // See if we already have a cached transpose and its value is up to date 2710e3ece09SJunchao Zhang if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 2720e3ece09SJunchao Zhang if (!akok->transpose_updated) { // if the value is out of date, update the cached version 2730e3ece09SJunchao Zhang const auto &perm = akok->transpose_perm; // get the permutation array 2740e3ece09SJunchao Zhang auto &Ta = T.values; 2750e3ece09SJunchao Zhang 2760e3ece09SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( 2770e3ece09SJunchao Zhang nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 278076ba34aSJunchao Zhang } 2790e3ece09SJunchao Zhang } else { // Generate T of size n x m for the first time 2800e3ece09SJunchao Zhang MatRowMapKokkosView perm; 2810e3ece09SJunchao Zhang 2820e3ece09SJunchao Zhang PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 2830e3ece09SJunchao Zhang akok->transpose_perm = perm; // cache the perm in this matrix for reuse 2840e3ece09SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( 2850e3ece09SJunchao Zhang nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 2860e3ece09SJunchao Zhang } 2870e3ece09SJunchao Zhang akok->transpose_updated = PETSC_TRUE; 2880e3ece09SJunchao Zhang *csrmatT = akok->csrmatT; 2890e3ece09SJunchao Zhang } 2900e3ece09SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 2910e3ece09SJunchao Zhang } 2920e3ece09SJunchao Zhang 2930e3ece09SJunchao Zhang // Generate the Hermitian on device and cache it internally 2940e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 2950e3ece09SJunchao Zhang { 2960e3ece09SJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 2970e3ece09SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 2980e3ece09SJunchao Zhang PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 2990e3ece09SJunchao Zhang KokkosCsrMatrix &T = akok->csrmatH; 3000e3ece09SJunchao Zhang 3010e3ece09SJunchao Zhang PetscFunctionBegin; 3020e3ece09SJunchao Zhang PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 3030e3ece09SJunchao Zhang PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 3040e3ece09SJunchao Zhang 3050e3ece09SJunchao Zhang const auto &Aa = akok->a_dual.view_device(); 3060e3ece09SJunchao Zhang 3070e3ece09SJunchao Zhang if (A->hermitian == PETSC_BOOL3_TRUE) { 3080e3ece09SJunchao Zhang *csrmatH = akok->csrmat; 3090e3ece09SJunchao Zhang } else { 3100e3ece09SJunchao Zhang // See if we already have a cached hermitian and its value is up to date 3110e3ece09SJunchao Zhang if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 3120e3ece09SJunchao Zhang if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 3130e3ece09SJunchao Zhang const auto &perm = akok->transpose_perm; // get the permutation array 3140e3ece09SJunchao Zhang auto &Ta = T.values; 3150e3ece09SJunchao Zhang 3160e3ece09SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( 3170e3ece09SJunchao Zhang nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 3180e3ece09SJunchao Zhang } 3190e3ece09SJunchao Zhang } else { // Generate T of size n x m for the first time 3200e3ece09SJunchao Zhang MatRowMapKokkosView perm; 3210e3ece09SJunchao Zhang 3220e3ece09SJunchao Zhang PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 3230e3ece09SJunchao Zhang akok->transpose_perm = perm; // cache the perm in this matrix for reuse 3240e3ece09SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( 3250e3ece09SJunchao Zhang nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 3260e3ece09SJunchao Zhang } 3270e3ece09SJunchao Zhang akok->hermitian_updated = PETSC_TRUE; 3280e3ece09SJunchao Zhang *csrmatH = akok->csrmatH; 3290e3ece09SJunchao Zhang } 3303ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 331152b3e56SJunchao Zhang } 332a587d139SMark 3338c3ff71bSJunchao Zhang /* y = A x */ 334d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 335d71ae5a4SJacob Faibussowitsch { 3368c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 337152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 338152b3e56SJunchao Zhang PetscScalarKokkosView yv; 3398c3ff71bSJunchao Zhang 3408c3ff71bSJunchao Zhang PetscFunctionBegin; 3419566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3429566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3439566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 3449566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 3458c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 3469d52486cSJunchao Zhang PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 3479566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 3489566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 349076ba34aSJunchao Zhang /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 3509566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 3519566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3523ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3538c3ff71bSJunchao Zhang } 3548c3ff71bSJunchao Zhang 3558c3ff71bSJunchao Zhang /* y = A^T x */ 356d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 357d71ae5a4SJacob Faibussowitsch { 3588c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 359152b3e56SJunchao Zhang const char *mode; 360152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 361152b3e56SJunchao Zhang PetscScalarKokkosView yv; 3620e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 3638c3ff71bSJunchao Zhang 3648c3ff71bSJunchao Zhang PetscFunctionBegin; 3659566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3669566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3679566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 3689566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 369152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 3709566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 371152b3e56SJunchao Zhang mode = "N"; 372152b3e56SJunchao Zhang } else { 373076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 3740e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 375152b3e56SJunchao Zhang mode = "T"; 376152b3e56SJunchao Zhang } 3770e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 3789566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 3799566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 3800e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 3819566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3823ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3838c3ff71bSJunchao Zhang } 3848c3ff71bSJunchao Zhang 3858c3ff71bSJunchao Zhang /* y = A^H x */ 386d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 387d71ae5a4SJacob Faibussowitsch { 3888c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 389152b3e56SJunchao Zhang const char *mode; 390152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 391152b3e56SJunchao Zhang PetscScalarKokkosView yv; 3920e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 3938c3ff71bSJunchao Zhang 3948c3ff71bSJunchao Zhang PetscFunctionBegin; 3959566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3969566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3979566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 3989566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 399152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 4009566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 401152b3e56SJunchao Zhang mode = "N"; 402152b3e56SJunchao Zhang } else { 403076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 4040e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 405152b3e56SJunchao Zhang mode = "C"; 406152b3e56SJunchao Zhang } 4070e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 4089566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 4099566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 4100e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 4119566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4123ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4138c3ff71bSJunchao Zhang } 4148c3ff71bSJunchao Zhang 4158c3ff71bSJunchao Zhang /* z = A x + y */ 416d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 417d71ae5a4SJacob Faibussowitsch { 4188c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 419152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv, yv; 420152b3e56SJunchao Zhang PetscScalarKokkosView zv; 4218c3ff71bSJunchao Zhang 4228c3ff71bSJunchao Zhang PetscFunctionBegin; 4239566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4249566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 4259566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 4269566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(yy, &yv)); 4279566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(zz, &zv)); 4288c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv, yv); 4298c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 4309d52486cSJunchao Zhang PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 4319566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 4329566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(yy, &yv)); 4339566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 4349566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 4359566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4378c3ff71bSJunchao Zhang } 4388c3ff71bSJunchao Zhang 4398c3ff71bSJunchao Zhang /* z = A^T x + y */ 440d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 441d71ae5a4SJacob Faibussowitsch { 4428c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 443152b3e56SJunchao Zhang const char *mode; 444152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv, yv; 445152b3e56SJunchao Zhang PetscScalarKokkosView zv; 4460e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 4478c3ff71bSJunchao Zhang 4488c3ff71bSJunchao Zhang PetscFunctionBegin; 4499566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4509566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 4519566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 4529566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(yy, &yv)); 4539566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(zz, &zv)); 4548c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv, yv); 455152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 4569566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 457152b3e56SJunchao Zhang mode = "N"; 458152b3e56SJunchao Zhang } else { 459076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 4600e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 461152b3e56SJunchao Zhang mode = "T"; 462152b3e56SJunchao Zhang } 4630e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 4649566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 4659566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(yy, &yv)); 4669566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 4670e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 4689566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4693ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4708c3ff71bSJunchao Zhang } 4718c3ff71bSJunchao Zhang 4728c3ff71bSJunchao Zhang /* z = A^H x + y */ 473d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 474d71ae5a4SJacob Faibussowitsch { 4758c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 476152b3e56SJunchao Zhang const char *mode; 477152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv, yv; 478152b3e56SJunchao Zhang PetscScalarKokkosView zv; 4790e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 4808c3ff71bSJunchao Zhang 4818c3ff71bSJunchao Zhang PetscFunctionBegin; 4829566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4839566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 4849566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 4859566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(yy, &yv)); 4869566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(zz, &zv)); 4878c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv, yv); 488152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 4899566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 490152b3e56SJunchao Zhang mode = "N"; 491152b3e56SJunchao Zhang } else { 492076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 4930e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 494152b3e56SJunchao Zhang mode = "C"; 495152b3e56SJunchao Zhang } 4960e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 4979566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 4989566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(yy, &yv)); 4999566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 5000e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 5019566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 5023ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 503152b3e56SJunchao Zhang } 504152b3e56SJunchao Zhang 505d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 506d71ae5a4SJacob Faibussowitsch { 507152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 508152b3e56SJunchao Zhang 509152b3e56SJunchao Zhang PetscFunctionBegin; 510152b3e56SJunchao Zhang switch (op) { 511152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 512152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 5139566063dSJacob Faibussowitsch if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 514152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 515152b3e56SJunchao Zhang break; 516d71ae5a4SJacob Faibussowitsch default: 517d71ae5a4SJacob Faibussowitsch PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 518d71ae5a4SJacob Faibussowitsch break; 519152b3e56SJunchao Zhang } 5203ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5218c3ff71bSJunchao Zhang } 5228c3ff71bSJunchao Zhang 523076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */ 524d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 525d71ae5a4SJacob Faibussowitsch { 526076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 5278c3ff71bSJunchao Zhang 5288c3ff71bSJunchao Zhang PetscFunctionBegin; 5299566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 530076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 5319566063dSJacob Faibussowitsch PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 5328c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 5339566063dSJacob Faibussowitsch PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 534076ba34aSJunchao Zhang } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 5355f80ce2aSJacob Faibussowitsch PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 5369566063dSJacob Faibussowitsch PetscCall(PetscFree(A->defaultvectype)); 5379566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 5389566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 5399566063dSJacob Faibussowitsch PetscCall(MatSetOps_SeqAIJKokkos(A)); 540076ba34aSJunchao Zhang aseq = static_cast<Mat_SeqAIJ *>(A->data); 541394ed5ebSJunchao Zhang if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 5425f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 543076ba34aSJunchao Zhang A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE); 5448c3ff71bSJunchao Zhang } 545076ba34aSJunchao Zhang } 5463ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5478c3ff71bSJunchao Zhang } 5488c3ff71bSJunchao Zhang 549076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 550076ba34aSJunchao Zhang an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 551076ba34aSJunchao Zhang */ 552d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 553d71ae5a4SJacob Faibussowitsch { 554076ba34aSJunchao Zhang Mat_SeqAIJ *bseq; 555076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 556076ba34aSJunchao Zhang Mat mat; 5578c3ff71bSJunchao Zhang 5588c3ff71bSJunchao Zhang PetscFunctionBegin; 559076ba34aSJunchao Zhang /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 5609566063dSJacob Faibussowitsch PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 561076ba34aSJunchao Zhang mat = *B; 562076ba34aSJunchao Zhang if (A->assembled) { 563076ba34aSJunchao Zhang bseq = static_cast<Mat_SeqAIJ *>(mat->data); 564076ba34aSJunchao Zhang bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE); 565076ba34aSJunchao Zhang bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 566076ba34aSJunchao Zhang /* Now copy values to B if needed */ 567076ba34aSJunchao Zhang if (dupOption == MAT_COPY_VALUES) { 568076ba34aSJunchao Zhang if (akok->a_dual.need_sync_device()) { 569076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 570076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 571076ba34aSJunchao Zhang } else { /* If device has the latest data, we only copy data on device */ 572076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 573076ba34aSJunchao Zhang bkok->a_dual.modify_device(); 574076ba34aSJunchao Zhang } 575076ba34aSJunchao Zhang } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 576076ba34aSJunchao Zhang /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 577076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 578076ba34aSJunchao Zhang } 579076ba34aSJunchao Zhang mat->spptr = bkok; 580076ba34aSJunchao Zhang } 581076ba34aSJunchao Zhang 5829566063dSJacob Faibussowitsch PetscCall(PetscFree(mat->defaultvectype)); 5839566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 5849566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 5859566063dSJacob Faibussowitsch PetscCall(MatSetOps_SeqAIJKokkos(mat)); 5863ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5878c3ff71bSJunchao Zhang } 5888c3ff71bSJunchao Zhang 589d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 590d71ae5a4SJacob Faibussowitsch { 5910ecb592aSJunchao Zhang Mat At; 5920e3ece09SJunchao Zhang KokkosCsrMatrix internT; 5930ecb592aSJunchao Zhang Mat_SeqAIJKokkos *atkok, *bkok; 5940ecb592aSJunchao Zhang 5950ecb592aSJunchao Zhang PetscFunctionBegin; 5967fb60732SBarry Smith if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 5979566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 5980ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 599ff751488SJunchao Zhang /* Deep copy internT, as we want to isolate the internal transpose */ 6000e3ece09SJunchao Zhang PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 6019566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 6020ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) *B = At; 6039566063dSJacob Faibussowitsch else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 6040ecb592aSJunchao Zhang } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 6050ecb592aSJunchao Zhang if ((*B)->assembled) { 6060ecb592aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 6070e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 6089566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 6090ecb592aSJunchao Zhang } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 6100ecb592aSJunchao Zhang Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 6110e3ece09SJunchao Zhang MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 6120e3ece09SJunchao Zhang MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 6130e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 6140e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 6150ecb592aSJunchao Zhang } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 6160ecb592aSJunchao Zhang } 6173ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6180ecb592aSJunchao Zhang } 6190ecb592aSJunchao Zhang 620d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 621d71ae5a4SJacob Faibussowitsch { 62286a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 6238c3ff71bSJunchao Zhang 6248c3ff71bSJunchao Zhang PetscFunctionBegin; 62586a27549SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 62686a27549SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 6278c3ff71bSJunchao Zhang delete aijkok; 62886a27549SJunchao Zhang } else { 62986a27549SJunchao Zhang delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 63086a27549SJunchao Zhang } 631cbc6b225SStefano Zampini A->spptr = NULL; 6329566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 6339566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 6349566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 6359566063dSJacob Faibussowitsch PetscCall(MatDestroy_SeqAIJ(A)); 6363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6378c3ff71bSJunchao Zhang } 6388c3ff71bSJunchao Zhang 6393f3ba80aSJunchao Zhang /*MC 6403f3ba80aSJunchao Zhang MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 6413f3ba80aSJunchao Zhang 6423f3ba80aSJunchao Zhang A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 6433f3ba80aSJunchao Zhang 6442ef1f0ffSBarry Smith Options Database Key: 64511a5261eSBarry Smith . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 6463f3ba80aSJunchao Zhang 6473f3ba80aSJunchao Zhang Level: beginner 6483f3ba80aSJunchao Zhang 6492ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 6503f3ba80aSJunchao Zhang M*/ 651d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 652d71ae5a4SJacob Faibussowitsch { 65386a27549SJunchao Zhang PetscFunctionBegin; 6549566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 6559566063dSJacob Faibussowitsch PetscCall(MatCreate_SeqAIJ(A)); 6569566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 6573ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 65886a27549SJunchao Zhang } 65986a27549SJunchao Zhang 660076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 661d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 662d71ae5a4SJacob Faibussowitsch { 663076ba34aSJunchao Zhang Mat_SeqAIJ *a, *b; 664076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok, *bkok, *ckok; 665076ba34aSJunchao Zhang MatScalarKokkosView aa, ba, ca; 666076ba34aSJunchao Zhang MatRowMapKokkosView ai, bi, ci; 667076ba34aSJunchao Zhang MatColIdxKokkosView aj, bj, cj; 668076ba34aSJunchao Zhang PetscInt m, n, nnz, aN; 669a3f881fbSStefano Zampini 670a3f881fbSStefano Zampini PetscFunctionBegin; 671076ba34aSJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 672076ba34aSJunchao Zhang PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 673076ba34aSJunchao Zhang PetscValidPointer(C, 4); 674076ba34aSJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 675076ba34aSJunchao Zhang PetscCheckTypeName(B, MATSEQAIJKOKKOS); 6765f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 6775f80ce2aSJacob Faibussowitsch PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 678076ba34aSJunchao Zhang 6799566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 6809566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 681076ba34aSJunchao Zhang a = static_cast<Mat_SeqAIJ *>(A->data); 682076ba34aSJunchao Zhang b = static_cast<Mat_SeqAIJ *>(B->data); 683076ba34aSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 684076ba34aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 685076ba34aSJunchao Zhang aa = akok->a_dual.view_device(); 686076ba34aSJunchao Zhang ai = akok->i_dual.view_device(); 687076ba34aSJunchao Zhang ba = bkok->a_dual.view_device(); 688076ba34aSJunchao Zhang bi = bkok->i_dual.view_device(); 689076ba34aSJunchao Zhang m = A->rmap->n; /* M, N and nnz of C */ 690076ba34aSJunchao Zhang n = A->cmap->n + B->cmap->n; 691076ba34aSJunchao Zhang nnz = a->nz + b->nz; 692076ba34aSJunchao Zhang aN = A->cmap->n; /* N of A */ 693076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { 694076ba34aSJunchao Zhang aj = akok->j_dual.view_device(); 695076ba34aSJunchao Zhang bj = bkok->j_dual.view_device(); 696076ba34aSJunchao Zhang auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 697076ba34aSJunchao Zhang auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 698076ba34aSJunchao Zhang auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 699076ba34aSJunchao Zhang ca = ca_dual.view_device(); 700076ba34aSJunchao Zhang ci = ci_dual.view_device(); 701076ba34aSJunchao Zhang cj = cj_dual.view_device(); 702076ba34aSJunchao Zhang 703076ba34aSJunchao Zhang /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 7049371c9d4SSatish Balay Kokkos::parallel_for( 7059371c9d4SSatish Balay Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 706076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 707076ba34aSJunchao Zhang PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 708076ba34aSJunchao Zhang 709076ba34aSJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 710076ba34aSJunchao Zhang ci(i) = coffset; 711076ba34aSJunchao Zhang if (i == m - 1) ci(m) = ai(m) + bi(m); 712076ba34aSJunchao Zhang }); 713076ba34aSJunchao Zhang 714076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 715076ba34aSJunchao Zhang if (k < alen) { 716076ba34aSJunchao Zhang ca(coffset + k) = aa(ai(i) + k); 717076ba34aSJunchao Zhang cj(coffset + k) = aj(ai(i) + k); 718076ba34aSJunchao Zhang } else { 719076ba34aSJunchao Zhang ca(coffset + k) = ba(bi(i) + k - alen); 720076ba34aSJunchao Zhang cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 721076ba34aSJunchao Zhang } 722076ba34aSJunchao Zhang }); 723076ba34aSJunchao Zhang }); 724076ba34aSJunchao Zhang ca_dual.modify_device(); 725076ba34aSJunchao Zhang ci_dual.modify_device(); 726076ba34aSJunchao Zhang cj_dual.modify_device(); 7279566063dSJacob Faibussowitsch PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 7289566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 729076ba34aSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { 730076ba34aSJunchao Zhang PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 731076ba34aSJunchao Zhang PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 732076ba34aSJunchao Zhang ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 733076ba34aSJunchao Zhang ca = ckok->a_dual.view_device(); 734076ba34aSJunchao Zhang ci = ckok->i_dual.view_device(); 735076ba34aSJunchao Zhang 7369371c9d4SSatish Balay Kokkos::parallel_for( 7379371c9d4SSatish Balay Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 738076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 739076ba34aSJunchao Zhang PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 740076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 741076ba34aSJunchao Zhang if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 742076ba34aSJunchao Zhang else ca(ci(i) + k) = ba(bi(i) + k - alen); 743076ba34aSJunchao Zhang }); 744076ba34aSJunchao Zhang }); 7459566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 746076ba34aSJunchao Zhang } 7473ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 748076ba34aSJunchao Zhang } 749076ba34aSJunchao Zhang 750d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 751d71ae5a4SJacob Faibussowitsch { 752076ba34aSJunchao Zhang PetscFunctionBegin; 753076ba34aSJunchao Zhang delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 7543ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 755a3f881fbSStefano Zampini } 756a3f881fbSStefano Zampini 757d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 758d71ae5a4SJacob Faibussowitsch { 759a3f881fbSStefano Zampini Mat_Product *product = C->product; 760a3f881fbSStefano Zampini Mat A, B; 761076ba34aSJunchao Zhang bool transA, transB; /* use bool, since KK needs this type */ 762a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok, *bkok, *ckok; 763a3f881fbSStefano Zampini Mat_SeqAIJ *c; 764076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 7650e3ece09SJunchao Zhang KokkosCsrMatrix csrmatA, csrmatB; 766a3f881fbSStefano Zampini 767a3f881fbSStefano Zampini PetscFunctionBegin; 768a3f881fbSStefano Zampini MatCheckProduct(C, 1); 7695f80ce2aSJacob Faibussowitsch PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 770076ba34aSJunchao Zhang pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 771076ba34aSJunchao Zhang 7720e3ece09SJunchao Zhang // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 7730e3ece09SJunchao Zhang // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 7740e3ece09SJunchao Zhang // we still do numeric. 7750e3ece09SJunchao Zhang if (pdata->reusesym) { // numeric reuses results from symbolic 7760e3ece09SJunchao Zhang pdata->reusesym = PETSC_FALSE; 7773ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 778076ba34aSJunchao Zhang } 779076ba34aSJunchao Zhang 780076ba34aSJunchao Zhang switch (product->type) { 7819371c9d4SSatish Balay case MATPRODUCT_AB: 7829371c9d4SSatish Balay transA = false; 7839371c9d4SSatish Balay transB = false; 7849371c9d4SSatish Balay break; 7859371c9d4SSatish Balay case MATPRODUCT_AtB: 7869371c9d4SSatish Balay transA = true; 7879371c9d4SSatish Balay transB = false; 7889371c9d4SSatish Balay break; 7899371c9d4SSatish Balay case MATPRODUCT_ABt: 7909371c9d4SSatish Balay transA = false; 7919371c9d4SSatish Balay transB = true; 7929371c9d4SSatish Balay break; 793d71ae5a4SJacob Faibussowitsch default: 794d71ae5a4SJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 795076ba34aSJunchao Zhang } 796076ba34aSJunchao Zhang 797a3f881fbSStefano Zampini A = product->A; 798a3f881fbSStefano Zampini B = product->B; 7999566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 8009566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 801a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 802a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 803a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 804076ba34aSJunchao Zhang 8055f80ce2aSJacob Faibussowitsch PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 806076ba34aSJunchao Zhang 8070e3ece09SJunchao Zhang csrmatA = akok->csrmat; 8080e3ece09SJunchao Zhang csrmatB = bkok->csrmat; 809076ba34aSJunchao Zhang 810076ba34aSJunchao Zhang /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 811076ba34aSJunchao Zhang if (transA) { 8129566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 813076ba34aSJunchao Zhang transA = false; 814a3f881fbSStefano Zampini } 815a3f881fbSStefano Zampini 816076ba34aSJunchao Zhang if (transB) { 8179566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 818076ba34aSJunchao Zhang transB = false; 819076ba34aSJunchao Zhang } 8209566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 8210e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 8220e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 823866eb059SJunchao Zhang auto spgemmHandle = pdata->kh.get_spgemm_handle(); 824866eb059SJunchao Zhang if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 825e944a159SJunchao Zhang #endif 826866eb059SJunchao Zhang 8279566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 8289566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(C)); 829a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 830a3f881fbSStefano Zampini c = (Mat_SeqAIJ *)C->data; 8319566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 8329566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 8339566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 834a3f881fbSStefano Zampini c->reallocs = 0; 835076ba34aSJunchao Zhang C->info.mallocs = 0; 836a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 837a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 838a3f881fbSStefano Zampini C->num_ass++; 8393ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 840a3f881fbSStefano Zampini } 841a3f881fbSStefano Zampini 842d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 843d71ae5a4SJacob Faibussowitsch { 844076ba34aSJunchao Zhang Mat_Product *product = C->product; 845076ba34aSJunchao Zhang MatProductType ptype; 846076ba34aSJunchao Zhang Mat A, B; 847076ba34aSJunchao Zhang bool transA, transB; 848076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok, *bkok, *ckok; 849076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 850076ba34aSJunchao Zhang MPI_Comm comm; 8510e3ece09SJunchao Zhang KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 852a3f881fbSStefano Zampini 853a3f881fbSStefano Zampini PetscFunctionBegin; 854a3f881fbSStefano Zampini MatCheckProduct(C, 1); 8559566063dSJacob Faibussowitsch PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 8565f80ce2aSJacob Faibussowitsch PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 857a3f881fbSStefano Zampini A = product->A; 858a3f881fbSStefano Zampini B = product->B; 8599566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 8609566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 861a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 862a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 8630e3ece09SJunchao Zhang csrmatA = akok->csrmat; 8640e3ece09SJunchao Zhang csrmatB = bkok->csrmat; 865076ba34aSJunchao Zhang 866a3f881fbSStefano Zampini ptype = product->type; 8670e3ece09SJunchao Zhang // Take advantage of the symmetry if true 8680e3ece09SJunchao Zhang if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 8690e3ece09SJunchao Zhang ptype = MATPRODUCT_AB; 8700e3ece09SJunchao Zhang product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 8710e3ece09SJunchao Zhang } 8720e3ece09SJunchao Zhang if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 8730e3ece09SJunchao Zhang ptype = MATPRODUCT_AB; 8740e3ece09SJunchao Zhang product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 8750e3ece09SJunchao Zhang } 8760e3ece09SJunchao Zhang 877a3f881fbSStefano Zampini switch (ptype) { 8789371c9d4SSatish Balay case MATPRODUCT_AB: 8799371c9d4SSatish Balay transA = false; 8809371c9d4SSatish Balay transB = false; 8819371c9d4SSatish Balay break; 8829371c9d4SSatish Balay case MATPRODUCT_AtB: 8839371c9d4SSatish Balay transA = true; 8849371c9d4SSatish Balay transB = false; 8859371c9d4SSatish Balay break; 8869371c9d4SSatish Balay case MATPRODUCT_ABt: 8879371c9d4SSatish Balay transA = false; 8889371c9d4SSatish Balay transB = true; 8899371c9d4SSatish Balay break; 890d71ae5a4SJacob Faibussowitsch default: 891d71ae5a4SJacob Faibussowitsch SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 892a3f881fbSStefano Zampini } 8930e3ece09SJunchao Zhang PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 894076ba34aSJunchao Zhang pdata->reusesym = product->api_user; 895a3f881fbSStefano Zampini 896076ba34aSJunchao Zhang /* TODO: add command line options to select spgemm algorithms */ 897866eb059SJunchao Zhang auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 898866eb059SJunchao Zhang 899866eb059SJunchao Zhang /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 900866eb059SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 901866eb059SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 902866eb059SJunchao Zhang spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 903866eb059SJunchao Zhang #endif 904866eb059SJunchao Zhang #endif 9050e3ece09SJunchao Zhang PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 906076ba34aSJunchao Zhang 9079566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 908076ba34aSJunchao Zhang /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 909076ba34aSJunchao Zhang if (transA) { 9109566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 911076ba34aSJunchao Zhang transA = false; 912076ba34aSJunchao Zhang } 913076ba34aSJunchao Zhang 914076ba34aSJunchao Zhang if (transB) { 9159566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 916076ba34aSJunchao Zhang transB = false; 917076ba34aSJunchao Zhang } 918076ba34aSJunchao Zhang 9190e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 920076ba34aSJunchao Zhang /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 921076ba34aSJunchao Zhang So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 922076ba34aSJunchao Zhang calling new Mat_SeqAIJKokkos(). 923076ba34aSJunchao Zhang TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 924076ba34aSJunchao Zhang */ 9250e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 9260e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 927866eb059SJunchao Zhang /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 928866eb059SJunchao Zhang auto spgemmHandle = pdata->kh.get_spgemm_handle(); 929866eb059SJunchao Zhang if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 930e944a159SJunchao Zhang #endif 9319566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 932076ba34aSJunchao Zhang 9339566063dSJacob Faibussowitsch PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 9349566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 935076ba34aSJunchao Zhang C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 9363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 937a3f881fbSStefano Zampini } 938a3f881fbSStefano Zampini 939a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 940d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 941d71ae5a4SJacob Faibussowitsch { 942076ba34aSJunchao Zhang Mat_Product *product = mat->product; 943a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 944a3f881fbSStefano Zampini 945a3f881fbSStefano Zampini PetscFunctionBegin; 946a3f881fbSStefano Zampini MatCheckProduct(mat, 1); 9479566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 94848a46eb9SPierre Jolivet if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 949a3f881fbSStefano Zampini if (Biskok && Ciskok) { 950a3f881fbSStefano Zampini switch (product->type) { 951a3f881fbSStefano Zampini case MATPRODUCT_AB: 952a3f881fbSStefano Zampini case MATPRODUCT_AtB: 953d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABt: 954d71ae5a4SJacob Faibussowitsch mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 955d71ae5a4SJacob Faibussowitsch break; 956a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 957a3f881fbSStefano Zampini case MATPRODUCT_RARt: 958d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABC: 959d71ae5a4SJacob Faibussowitsch mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 960d71ae5a4SJacob Faibussowitsch break; 961d71ae5a4SJacob Faibussowitsch default: 962d71ae5a4SJacob Faibussowitsch break; 963a3f881fbSStefano Zampini } 964a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 9659566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 966a3f881fbSStefano Zampini } 9673ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 968a3f881fbSStefano Zampini } 969a587d139SMark 970d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 971d71ae5a4SJacob Faibussowitsch { 972f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 973f0cf5187SStefano Zampini 974f0cf5187SStefano Zampini PetscFunctionBegin; 9759566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 9769566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 977f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 978076ba34aSJunchao Zhang KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 9799566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 9809566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 9819566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 9823ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 983f0cf5187SStefano Zampini } 984f0cf5187SStefano Zampini 985d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 986d71ae5a4SJacob Faibussowitsch { 987076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 988a587d139SMark 989a587d139SMark PetscFunctionBegin; 990076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 9912328674fSJunchao Zhang if (aijkok) { /* Only zero the device if data is already there */ 992076ba34aSJunchao Zhang KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0); 9939566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 9942328674fSJunchao Zhang } else { /* Might be preallocated but not assembled */ 9959566063dSJacob Faibussowitsch PetscCall(MatZeroEntries_SeqAIJ(A)); 9962328674fSJunchao Zhang } 9973ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 998a587d139SMark } 999a587d139SMark 1000d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1001d71ae5a4SJacob Faibussowitsch { 1002f78ce678SMark Adams Mat_SeqAIJ *aijseq; 1003f78ce678SMark Adams Mat_SeqAIJKokkos *aijkok; 1004f78ce678SMark Adams PetscInt n; 1005f78ce678SMark Adams PetscScalarKokkosView xv; 1006f78ce678SMark Adams 1007f78ce678SMark Adams PetscFunctionBegin; 1008f78ce678SMark Adams PetscCall(VecGetLocalSize(x, &n)); 1009f78ce678SMark Adams PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1010f78ce678SMark Adams PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1011f78ce678SMark Adams 1012f78ce678SMark Adams PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1013f78ce678SMark Adams aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1014f78ce678SMark Adams 1015f78ce678SMark Adams if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */ 1016f78ce678SMark Adams PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1017f78ce678SMark Adams aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1018f78ce678SMark Adams aijkok->SetDiagonal(aijseq->diag); 1019f78ce678SMark Adams } 1020f78ce678SMark Adams 1021f78ce678SMark Adams const auto &Aa = aijkok->a_dual.view_device(); 1022f78ce678SMark Adams const auto &Ai = aijkok->i_dual.view_device(); 1023f78ce678SMark Adams const auto &Adiag = aijkok->diag_dual.view_device(); 1024f78ce678SMark Adams 1025f78ce678SMark Adams PetscCall(VecGetKokkosViewWrite(x, &xv)); 10269371c9d4SSatish Balay Kokkos::parallel_for( 10279371c9d4SSatish Balay n, KOKKOS_LAMBDA(const PetscInt i) { 1028f78ce678SMark Adams if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1029f78ce678SMark Adams else xv(i) = 0; 1030f78ce678SMark Adams }); 1031f78ce678SMark Adams PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 10323ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1033f78ce678SMark Adams } 1034f78ce678SMark Adams 1035db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1036d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1037d71ae5a4SJacob Faibussowitsch { 1038db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1039db78de30SJunchao Zhang 1040db78de30SJunchao Zhang PetscFunctionBegin; 1041db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1042db78de30SJunchao Zhang PetscValidPointer(kv, 2); 1043db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 10449566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1045db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1046076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 10473ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1048db78de30SJunchao Zhang } 1049db78de30SJunchao Zhang 1050d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1051d71ae5a4SJacob Faibussowitsch { 1052db78de30SJunchao Zhang PetscFunctionBegin; 1053db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1054db78de30SJunchao Zhang PetscValidPointer(kv, 2); 1055db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 10563ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1057db78de30SJunchao Zhang } 1058db78de30SJunchao Zhang 1059d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1060d71ae5a4SJacob Faibussowitsch { 1061db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1062db78de30SJunchao Zhang 1063db78de30SJunchao Zhang PetscFunctionBegin; 1064db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1065db78de30SJunchao Zhang PetscValidPointer(kv, 2); 1066db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 10679566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1068db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1069076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 10703ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1071db78de30SJunchao Zhang } 1072db78de30SJunchao Zhang 1073d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1074d71ae5a4SJacob Faibussowitsch { 1075db78de30SJunchao Zhang PetscFunctionBegin; 1076db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1077db78de30SJunchao Zhang PetscValidPointer(kv, 2); 1078db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 10799566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 10803ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1081db78de30SJunchao Zhang } 1082db78de30SJunchao Zhang 1083d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1084d71ae5a4SJacob Faibussowitsch { 1085db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1086db78de30SJunchao Zhang 1087db78de30SJunchao Zhang PetscFunctionBegin; 1088db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1089db78de30SJunchao Zhang PetscValidPointer(kv, 2); 1090db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1091db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1092076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 10933ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1094db78de30SJunchao Zhang } 1095db78de30SJunchao Zhang 1096d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1097d71ae5a4SJacob Faibussowitsch { 1098db78de30SJunchao Zhang PetscFunctionBegin; 1099db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1100db78de30SJunchao Zhang PetscValidPointer(kv, 2); 1101db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 11029566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 11033ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1104db78de30SJunchao Zhang } 1105db78de30SJunchao Zhang 1106c17cf699SJunchao Zhang /* Computes Y += alpha X */ 1107d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1108d71ae5a4SJacob Faibussowitsch { 1109a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1110c17cf699SJunchao Zhang Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1111c17cf699SJunchao Zhang ConstMatScalarKokkosView Xa; 1112c17cf699SJunchao Zhang MatScalarKokkosView Ya; 1113a587d139SMark 1114a587d139SMark PetscFunctionBegin; 1115c17cf699SJunchao Zhang PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1116c17cf699SJunchao Zhang PetscCheckTypeName(X, MATSEQAIJKOKKOS); 11179566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 11189566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(X)); 11199566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1120db78de30SJunchao Zhang 1121c17cf699SJunchao Zhang if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1122a587d139SMark PetscBool e; 11239566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1124a587d139SMark if (e) { 11259566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1126c17cf699SJunchao Zhang if (e) pattern = SAME_NONZERO_PATTERN; 1127a587d139SMark } 1128a587d139SMark } 1129db78de30SJunchao Zhang 1130c17cf699SJunchao Zhang /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1131c17cf699SJunchao Zhang cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1132c17cf699SJunchao Zhang If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1133c17cf699SJunchao Zhang KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1134c17cf699SJunchao Zhang */ 1135c17cf699SJunchao Zhang ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1136c17cf699SJunchao Zhang xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1137c17cf699SJunchao Zhang Xa = xkok->a_dual.view_device(); 1138c17cf699SJunchao Zhang Ya = ykok->a_dual.view_device(); 1139c17cf699SJunchao Zhang 1140c17cf699SJunchao Zhang if (pattern == SAME_NONZERO_PATTERN) { 1141c17cf699SJunchao Zhang KokkosBlas::axpy(alpha, Xa, Ya); 11429566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1143c17cf699SJunchao Zhang } else if (pattern == SUBSET_NONZERO_PATTERN) { 1144c17cf699SJunchao Zhang MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1145c17cf699SJunchao Zhang MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1146c17cf699SJunchao Zhang 11479371c9d4SSatish Balay Kokkos::parallel_for( 11489371c9d4SSatish Balay Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 11490e3ece09SJunchao Zhang PetscInt i = t.league_rank(); // row i 11500e3ece09SJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { 11510e3ece09SJunchao Zhang // Only one thread works in a team 1152c17cf699SJunchao Zhang PetscInt p, q = Yi(i); 11530e3ece09SJunchao Zhang for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 11540e3ece09SJunchao Zhang while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 11550e3ece09SJunchao Zhang if (Xj(p) == Yj(q)) { // Found it 1156c17cf699SJunchao Zhang Ya(q) += alpha * Xa(p); 1157c17cf699SJunchao Zhang q++; 1158a587d139SMark } else { 11590e3ece09SJunchao Zhang // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 11600e3ece09SJunchao Zhang // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 11610e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 11620e3ece09SJunchao Zhang if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 11638b8b16f9SJunchao Zhang #else 11640e3ece09SJunchao Zhang if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 11658b8b16f9SJunchao Zhang #endif 1166a587d139SMark } 1167c17cf699SJunchao Zhang } 1168c17cf699SJunchao Zhang }); 1169c17cf699SJunchao Zhang }); 11709566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 11710e3ece09SJunchao Zhang } else { // different nonzero patterns 1172c17cf699SJunchao Zhang Mat Z; 1173c17cf699SJunchao Zhang KokkosCsrMatrix zcsr; 1174c17cf699SJunchao Zhang KernelHandle kh; 11750e3ece09SJunchao Zhang kh.create_spadd_handle(true); // X, Y are sorted 1176c17cf699SJunchao Zhang KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1177c17cf699SJunchao Zhang KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1178c17cf699SJunchao Zhang zkok = new Mat_SeqAIJKokkos(zcsr); 11799566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 11809566063dSJacob Faibussowitsch PetscCall(MatHeaderReplace(Y, &Z)); 1181c17cf699SJunchao Zhang kh.destroy_spadd_handle(); 1182c17cf699SJunchao Zhang } 11839566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 11840e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 11853ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1186a587d139SMark } 1187a587d139SMark 1188d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1189d71ae5a4SJacob Faibussowitsch { 119042550becSJunchao Zhang Mat_SeqAIJKokkos *akok; 119142550becSJunchao Zhang Mat_SeqAIJ *aseq; 119242550becSJunchao Zhang 119342550becSJunchao Zhang PetscFunctionBegin; 11949566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1195394ed5ebSJunchao Zhang aseq = static_cast<Mat_SeqAIJ *>(mat->data); 119642550becSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1197cbc6b225SStefano Zampini delete akok; 1198cbc6b225SStefano Zampini mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE); 11999566063dSJacob Faibussowitsch PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1200394ed5ebSJunchao Zhang akok->SetUpCOO(aseq); 12013ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 120242550becSJunchao Zhang } 120342550becSJunchao Zhang 1204d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1205d71ae5a4SJacob Faibussowitsch { 120642550becSJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 120742550becSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1208394ed5ebSJunchao Zhang PetscCount Annz = aseq->nz; 1209394ed5ebSJunchao Zhang const PetscCountKokkosView &jmap = akok->jmap_d; 1210394ed5ebSJunchao Zhang const PetscCountKokkosView &perm = akok->perm_d; 121142550becSJunchao Zhang MatScalarKokkosView Aa; 121242550becSJunchao Zhang ConstMatScalarKokkosView kv; 121342550becSJunchao Zhang PetscMemType memtype; 121442550becSJunchao Zhang 121542550becSJunchao Zhang PetscFunctionBegin; 12169566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(v, &memtype)); 121742550becSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1218394ed5ebSJunchao Zhang kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n)); 121942550becSJunchao Zhang } else { 1220394ed5ebSJunchao Zhang kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */ 122142550becSJunchao Zhang } 122242550becSJunchao Zhang 1223c7b718f4SJunchao Zhang if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1224c7b718f4SJunchao Zhang else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 122542550becSJunchao Zhang 12269371c9d4SSatish Balay Kokkos::parallel_for( 12279371c9d4SSatish Balay Annz, KOKKOS_LAMBDA(const PetscCount i) { 1228c7b718f4SJunchao Zhang PetscScalar sum = 0.0; 1229c7b718f4SJunchao Zhang for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1230c7b718f4SJunchao Zhang Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1231c7b718f4SJunchao Zhang }); 1232394ed5ebSJunchao Zhang 12339566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 12349566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 12353ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 123642550becSJunchao Zhang } 123742550becSJunchao Zhang 1238d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag) 1239d71ae5a4SJacob Faibussowitsch { 12405fbaff96SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 12415fbaff96SJunchao Zhang MatScalarKokkosView Aa; 12425fbaff96SJunchao Zhang const MatRowMapKokkosView &Ai = akok->i_dual.view_device(); 12435fbaff96SJunchao Zhang PetscInt m = A->rmap->n; 12445fbaff96SJunchao Zhang ConstMatRowMapKokkosView Adiag(diag, m); /* diag is a device pointer */ 12455fbaff96SJunchao Zhang 12465fbaff96SJunchao Zhang PetscFunctionBegin; 12475fbaff96SJunchao Zhang PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); 12489371c9d4SSatish Balay Kokkos::parallel_for( 12499371c9d4SSatish Balay m, KOKKOS_LAMBDA(const PetscInt i) { 12505fbaff96SJunchao Zhang PetscScalar tmp; 12515fbaff96SJunchao Zhang if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */ 12525fbaff96SJunchao Zhang tmp = Aa(Ai(i)); 12535fbaff96SJunchao Zhang Aa(Ai(i)) = Aa(Adiag(i)); 12545fbaff96SJunchao Zhang Aa(Adiag(i)) = tmp; 12555fbaff96SJunchao Zhang } 12565fbaff96SJunchao Zhang }); 12575fbaff96SJunchao Zhang PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 12583ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 12595fbaff96SJunchao Zhang } 12605fbaff96SJunchao Zhang 1261d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1262d71ae5a4SJacob Faibussowitsch { 12638f7e8f9dSMark Adams PetscFunctionBegin; 12649566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncHost(A)); 12659566063dSJacob Faibussowitsch PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 12668f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 12673ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 12688f7e8f9dSMark Adams } 12698f7e8f9dSMark Adams 1270d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1271d71ae5a4SJacob Faibussowitsch { 1272076ba34aSJunchao Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1273076ba34aSJunchao Zhang 12748c3ff71bSJunchao Zhang PetscFunctionBegin; 1275076ba34aSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 12766f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 12776f3d89d0SStefano Zampini 12788c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 12798c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 12808c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1281a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 1282f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 1283a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1284076ba34aSJunchao Zhang A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 12858c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 12868c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 12878c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 12888c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 12898c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 12908c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1291076ba34aSJunchao Zhang A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 12920ecb592aSJunchao Zhang A->ops->transpose = MatTranspose_SeqAIJKokkos; 1293152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 1294f78ce678SMark Adams A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1295076ba34aSJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1296076ba34aSJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1297076ba34aSJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1298076ba34aSJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1299076ba34aSJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1300076ba34aSJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 13017ee59b9bSJunchao Zhang a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 130242550becSJunchao Zhang 13039566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 13049566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 13053ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1306076ba34aSJunchao Zhang } 1307076ba34aSJunchao Zhang 1308d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1309d71ae5a4SJacob Faibussowitsch { 1310076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 1311076ba34aSJunchao Zhang PetscInt i, m, n; 1312076ba34aSJunchao Zhang 1313076ba34aSJunchao Zhang PetscFunctionBegin; 13145f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1315076ba34aSJunchao Zhang 1316076ba34aSJunchao Zhang m = akok->nrows(); 1317076ba34aSJunchao Zhang n = akok->ncols(); 13189566063dSJacob Faibussowitsch PetscCall(MatSetSizes(A, m, n, m, n)); 13199566063dSJacob Faibussowitsch PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1320076ba34aSJunchao Zhang 1321076ba34aSJunchao Zhang /* Set up data structures of A as a MATSEQAIJ */ 13229566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1323076ba34aSJunchao Zhang aseq = (Mat_SeqAIJ *)(A)->data; 1324076ba34aSJunchao Zhang 1325076ba34aSJunchao Zhang akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1326076ba34aSJunchao Zhang akok->j_dual.sync_host(); 1327076ba34aSJunchao Zhang 1328076ba34aSJunchao Zhang aseq->i = akok->i_host_data(); 1329076ba34aSJunchao Zhang aseq->j = akok->j_host_data(); 1330076ba34aSJunchao Zhang aseq->a = akok->a_host_data(); 1331076ba34aSJunchao Zhang aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1332076ba34aSJunchao Zhang aseq->singlemalloc = PETSC_FALSE; 1333076ba34aSJunchao Zhang aseq->free_a = PETSC_FALSE; 1334076ba34aSJunchao Zhang aseq->free_ij = PETSC_FALSE; 1335076ba34aSJunchao Zhang aseq->nz = akok->nnz(); 1336076ba34aSJunchao Zhang aseq->maxnz = aseq->nz; 1337076ba34aSJunchao Zhang 13389566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aseq->imax)); 13399566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aseq->ilen)); 1340ad540459SPierre Jolivet for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1341076ba34aSJunchao Zhang 1342076ba34aSJunchao Zhang /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1343076ba34aSJunchao Zhang akok->nonzerostate = A->nonzerostate; 1344ff751488SJunchao Zhang A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 13459566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 13469566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 13473ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1348076ba34aSJunchao Zhang } 1349076ba34aSJunchao Zhang 13500e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 13510e3ece09SJunchao Zhang { 13520e3ece09SJunchao Zhang PetscFunctionBegin; 13530e3ece09SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(A)); 13540e3ece09SJunchao Zhang *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 13550e3ece09SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 13560e3ece09SJunchao Zhang } 13570e3ece09SJunchao Zhang 13580e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 13590e3ece09SJunchao Zhang { 13600e3ece09SJunchao Zhang Mat_SeqAIJKokkos *akok; 13610e3ece09SJunchao Zhang PetscFunctionBegin; 13620e3ece09SJunchao Zhang PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 13630e3ece09SJunchao Zhang PetscCall(MatCreate(comm, A)); 13640e3ece09SJunchao Zhang PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 13650e3ece09SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 13660e3ece09SJunchao Zhang } 13670e3ece09SJunchao Zhang 1368076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1369076ba34aSJunchao Zhang 1370076ba34aSJunchao Zhang Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1371076ba34aSJunchao Zhang */ 1372d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1373d71ae5a4SJacob Faibussowitsch { 1374076ba34aSJunchao Zhang PetscFunctionBegin; 13759566063dSJacob Faibussowitsch PetscCall(MatCreate(comm, A)); 13769566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 13773ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 13788c3ff71bSJunchao Zhang } 13798c3ff71bSJunchao Zhang 1380152b3e56SJunchao Zhang /*@C 138111a5261eSBarry Smith MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 13828c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 138320f4b53cSBarry Smith Kokkos for calculations. 13848c3ff71bSJunchao Zhang 13858c3ff71bSJunchao Zhang Collective 13868c3ff71bSJunchao Zhang 13878c3ff71bSJunchao Zhang Input Parameters: 138811a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF` 13898c3ff71bSJunchao Zhang . m - number of rows 13908c3ff71bSJunchao Zhang . n - number of columns 139120f4b53cSBarry Smith . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 139220f4b53cSBarry Smith - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 13938c3ff71bSJunchao Zhang 13948c3ff71bSJunchao Zhang Output Parameter: 13958c3ff71bSJunchao Zhang . A - the matrix 13968c3ff71bSJunchao Zhang 13972ef1f0ffSBarry Smith Level: intermediate 13982ef1f0ffSBarry Smith 13992ef1f0ffSBarry Smith Notes: 140011a5261eSBarry Smith It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 14018c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 140211a5261eSBarry Smith [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 14038c3ff71bSJunchao Zhang 140411a5261eSBarry Smith The AIJ format, also called 14052ef1f0ffSBarry Smith compressed row storage, is fully compatible with standard Fortran 14068c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 140720f4b53cSBarry Smith either one (as in Fortran) or zero. 14088c3ff71bSJunchao Zhang 14092ef1f0ffSBarry Smith Specify the preallocated storage with either `nz` or `nnz` (not both). 14102ef1f0ffSBarry Smith Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 14112ef1f0ffSBarry Smith allocation. 14128c3ff71bSJunchao Zhang 14132ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()` 14148c3ff71bSJunchao Zhang @*/ 1415d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1416d71ae5a4SJacob Faibussowitsch { 14178c3ff71bSJunchao Zhang PetscFunctionBegin; 14189566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 14199566063dSJacob Faibussowitsch PetscCall(MatCreate(comm, A)); 14209566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A, m, n, m, n)); 14219566063dSJacob Faibussowitsch PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 14229566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 14233ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 14248c3ff71bSJunchao Zhang } 1425930e68a5SMark Adams 1426d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1427d71ae5a4SJacob Faibussowitsch { 1428930e68a5SMark Adams PetscFunctionBegin; 14299566063dSJacob Faibussowitsch PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 143086a27549SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 14313ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 143286a27549SJunchao Zhang } 143386a27549SJunchao Zhang 1434d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1435d71ae5a4SJacob Faibussowitsch { 143686a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 143786a27549SJunchao Zhang 143886a27549SJunchao Zhang PetscFunctionBegin; 143986a27549SJunchao Zhang if (!factors->sptrsv_symbolic_completed) { 144086a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 144186a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 144286a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_TRUE; 144386a27549SJunchao Zhang } 14443ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 144586a27549SJunchao Zhang } 144686a27549SJunchao Zhang 144786a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */ 1448d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1449d71ae5a4SJacob Faibussowitsch { 145086a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1451076ba34aSJunchao Zhang MatColIdxType n = A->rmap->n; 145286a27549SJunchao Zhang 145386a27549SJunchao Zhang PetscFunctionBegin; 145486a27549SJunchao Zhang if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 145586a27549SJunchao Zhang /* Update L^T and do sptrsv symbolic */ 1456*7b8d4ba6SJunchao Zhang factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0 1457*7b8d4ba6SJunchao Zhang factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 1458*7b8d4ba6SJunchao Zhang factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 145986a27549SJunchao Zhang 14609371c9d4SSatish Balay transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 146186a27549SJunchao Zhang factors->iLt_d, factors->jLt_d, factors->aLt_d); 146286a27549SJunchao Zhang 146386a27549SJunchao Zhang /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 146486a27549SJunchao Zhang We have to sort the indices, until KK provides finer control options. 146586a27549SJunchao Zhang */ 14669371c9d4SSatish Balay sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 146786a27549SJunchao Zhang 146886a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 146986a27549SJunchao Zhang 147086a27549SJunchao Zhang /* Update U^T and do sptrsv symbolic */ 1471*7b8d4ba6SJunchao Zhang factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0 1472*7b8d4ba6SJunchao Zhang factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 1473*7b8d4ba6SJunchao Zhang factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 147486a27549SJunchao Zhang 14759371c9d4SSatish Balay transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 147686a27549SJunchao Zhang factors->iUt_d, factors->jUt_d, factors->aUt_d); 147786a27549SJunchao Zhang 147886a27549SJunchao Zhang /* Sort indices. See comments above */ 14799371c9d4SSatish Balay sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 148086a27549SJunchao Zhang 148186a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 148286a27549SJunchao Zhang factors->transpose_updated = PETSC_TRUE; 148386a27549SJunchao Zhang } 14843ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 148586a27549SJunchao Zhang } 148686a27549SJunchao Zhang 148786a27549SJunchao Zhang /* Solve Ax = b, with A = LU */ 1488d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1489d71ae5a4SJacob Faibussowitsch { 149086a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 149186a27549SJunchao Zhang PetscScalarKokkosView xv; 149286a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 149386a27549SJunchao Zhang 149486a27549SJunchao Zhang PetscFunctionBegin; 14959566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 14969566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 14979566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(b, &bv)); 14989566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(x, &xv)); 149986a27549SJunchao Zhang /* Solve L tmpv = b */ 15009566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 150186a27549SJunchao Zhang /* Solve Ux = tmpv */ 15029566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 15039566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(b, &bv)); 15049566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 15059566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 15063ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 150786a27549SJunchao Zhang } 150886a27549SJunchao Zhang 1509076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */ 1510d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1511d71ae5a4SJacob Faibussowitsch { 151286a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 151386a27549SJunchao Zhang PetscScalarKokkosView xv; 151486a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 151586a27549SJunchao Zhang 151686a27549SJunchao Zhang PetscFunctionBegin; 15179566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 15189566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 15199566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(b, &bv)); 15209566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(x, &xv)); 152186a27549SJunchao Zhang /* Solve U^T tmpv = b */ 152286a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 152386a27549SJunchao Zhang 152486a27549SJunchao Zhang /* Solve L^T x = tmpv */ 152586a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 15269566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(b, &bv)); 15279566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 15289566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 15293ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 153086a27549SJunchao Zhang } 153186a27549SJunchao Zhang 1532d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1533d71ae5a4SJacob Faibussowitsch { 153486a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 153586a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 153686a27549SJunchao Zhang PetscInt fill_lev = info->levels; 153786a27549SJunchao Zhang 153886a27549SJunchao Zhang PetscFunctionBegin; 15399566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 15409566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1541076ba34aSJunchao Zhang 1542076ba34aSJunchao Zhang auto a_d = aijkok->a_dual.view_device(); 1543076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1544076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1545076ba34aSJunchao Zhang 1546076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 154786a27549SJunchao Zhang 154886a27549SJunchao Zhang B->assembled = PETSC_TRUE; 154986a27549SJunchao Zhang B->preallocated = PETSC_TRUE; 155086a27549SJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos; 155186a27549SJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 155286a27549SJunchao Zhang B->ops->matsolve = NULL; 155386a27549SJunchao Zhang B->ops->matsolvetranspose = NULL; 155486a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 155586a27549SJunchao Zhang 155686a27549SJunchao Zhang /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 155786a27549SJunchao Zhang factors->transpose_updated = PETSC_FALSE; 155886a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 1559eeadb341SJunchao Zhang /* TODO: log flops, but how to know that? */ 15609566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 15613ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 156286a27549SJunchao Zhang } 156386a27549SJunchao Zhang 1564d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1565d71ae5a4SJacob Faibussowitsch { 156686a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 156786a27549SJunchao Zhang Mat_SeqAIJ *b; 156886a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 156986a27549SJunchao Zhang PetscInt fill_lev = info->levels; 157086a27549SJunchao Zhang PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 157186a27549SJunchao Zhang PetscInt n = A->rmap->n; 157286a27549SJunchao Zhang 157386a27549SJunchao Zhang PetscFunctionBegin; 15749566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 157586a27549SJunchao Zhang /* Rebuild factors */ 15769371c9d4SSatish Balay if (factors) { 15779371c9d4SSatish Balay factors->Destroy(); 15789371c9d4SSatish Balay } /* Destroy the old if it exists */ 15799371c9d4SSatish Balay else { 15809371c9d4SSatish Balay B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 15819371c9d4SSatish Balay } 158286a27549SJunchao Zhang 158386a27549SJunchao Zhang /* Create a spiluk handle and then do symbolic factorization */ 158486a27549SJunchao Zhang nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 158586a27549SJunchao Zhang factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 158686a27549SJunchao Zhang 158786a27549SJunchao Zhang auto spiluk_handle = factors->kh.get_spiluk_handle(); 158886a27549SJunchao Zhang 158986a27549SJunchao Zhang Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 159086a27549SJunchao Zhang Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 159186a27549SJunchao Zhang Kokkos::realloc(factors->iU_d, n + 1); 159286a27549SJunchao Zhang Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 159386a27549SJunchao Zhang 159486a27549SJunchao Zhang aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1595076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1596076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1597076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 159886a27549SJunchao Zhang /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 159986a27549SJunchao Zhang 160086a27549SJunchao Zhang Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 160186a27549SJunchao Zhang Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 160286a27549SJunchao Zhang Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 160386a27549SJunchao Zhang Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 160486a27549SJunchao Zhang 160586a27549SJunchao Zhang /* TODO: add options to select sptrsv algorithms */ 160686a27549SJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 160786a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 160886a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 160986a27549SJunchao Zhang #else 161086a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 161186a27549SJunchao Zhang #endif 161286a27549SJunchao Zhang 161386a27549SJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 161486a27549SJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 161586a27549SJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 161686a27549SJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 161786a27549SJunchao Zhang 161886a27549SJunchao Zhang /* Fill fields of the factor matrix B */ 16199566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 162086a27549SJunchao Zhang b = (Mat_SeqAIJ *)B->data; 162186a27549SJunchao Zhang b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 162286a27549SJunchao Zhang B->info.fill_ratio_given = info->fill; 1623a1e4e190SStefano Zampini B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 162486a27549SJunchao Zhang 162586a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 162686a27549SJunchao Zhang B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 16273ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1628930e68a5SMark Adams } 1629930e68a5SMark Adams 1630d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1631d71ae5a4SJacob Faibussowitsch { 1632930e68a5SMark Adams PetscFunctionBegin; 1633930e68a5SMark Adams *type = MATSOLVERKOKKOS; 16343ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1635930e68a5SMark Adams } 1636930e68a5SMark Adams 1637930e68a5SMark Adams /*MC 163886a27549SJunchao Zhang MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 163911a5261eSBarry Smith on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1640930e68a5SMark Adams 1641930e68a5SMark Adams Level: beginner 1642930e68a5SMark Adams 16432ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1644930e68a5SMark Adams M*/ 164586a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1646930e68a5SMark Adams { 1647930e68a5SMark Adams PetscInt n = A->rmap->n; 1648930e68a5SMark Adams 1649930e68a5SMark Adams PetscFunctionBegin; 16509566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 16519566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*B, n, n, n, n)); 1652930e68a5SMark Adams (*B)->factortype = ftype; 16539566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 16549566063dSJacob Faibussowitsch PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1655930e68a5SMark Adams 16568f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 16579566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 165886a27549SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 165986a27549SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 166086a27549SJunchao Zhang } else if (ftype == MAT_FACTOR_ILU) { 16619566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 166286a27549SJunchao Zhang (*B)->canuseordering = PETSC_FALSE; 166386a27549SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 166498921bdaSJacob Faibussowitsch } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1665930e68a5SMark Adams 16669566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 16679566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 16683ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1669930e68a5SMark Adams } 16708f7e8f9dSMark Adams 1671d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1672d71ae5a4SJacob Faibussowitsch { 167386a27549SJunchao Zhang PetscFunctionBegin; 16749566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 16759566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 16763ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 167786a27549SJunchao Zhang } 167886a27549SJunchao Zhang 1679076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */ 1680d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1681d71ae5a4SJacob Faibussowitsch { 1682076ba34aSJunchao Zhang const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map); 1683076ba34aSJunchao Zhang const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries); 1684076ba34aSJunchao Zhang const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values); 1685076ba34aSJunchao Zhang const PetscInt *i = iv.data(); 1686076ba34aSJunchao Zhang const PetscInt *j = jv.data(); 1687076ba34aSJunchao Zhang const PetscScalar *a = av.data(); 1688076ba34aSJunchao Zhang PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 1689076ba34aSJunchao Zhang 1690076ba34aSJunchao Zhang PetscFunctionBegin; 16919566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 1692076ba34aSJunchao Zhang for (PetscInt k = 0; k < m; k++) { 16939566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 169448a46eb9SPierre Jolivet for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 16959566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 1696076ba34aSJunchao Zhang } 16973ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1698076ba34aSJunchao Zhang } 1699