111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 2076ba34aSJunchao Zhang #include <petscpkg_version.h> 3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 442550becSJunchao Zhang #include <petsc/private/sfimpl.h> 58c3ff71bSJunchao Zhang #include <petscsystypes.h> 68c3ff71bSJunchao Zhang #include <petscerror.h> 78c3ff71bSJunchao Zhang 88c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 9f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 108c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 118c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 1286a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp> 1386a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp> 14076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp> 15076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp> 1686a27549SJunchao Zhang 1742550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 188c3ff71bSJunchao Zhang 190e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 20f98996d3SJunchao Zhang #include <KokkosSparse_Utils.hpp> 21f98996d3SJunchao Zhang using KokkosSparse::sort_crs_matrix; 229371c9d4SSatish Balay using KokkosSparse::Impl::transpose_matrix; 23f98996d3SJunchao Zhang #else 24f98996d3SJunchao Zhang #include <KokkosKernels_Sorting.hpp> 25f98996d3SJunchao Zhang using KokkosKernels::sort_crs_matrix; 269371c9d4SSatish Balay using KokkosKernels::Impl::transpose_matrix; 27f98996d3SJunchao Zhang #endif 28f98996d3SJunchao Zhang 298c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 308c3ff71bSJunchao Zhang 31076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 32076ba34aSJunchao Zhang we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 33076ba34aSJunchao Zhang In the latter case, it is important to set a_dual's sync state correctly. 34076ba34aSJunchao Zhang */ 35d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 36d71ae5a4SJacob Faibussowitsch { 37076ba34aSJunchao Zhang Mat_SeqAIJ *aijseq; 38076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 398c3ff71bSJunchao Zhang 408c3ff71bSJunchao Zhang PetscFunctionBegin; 413ba16761SJacob Faibussowitsch if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 429566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 43076ba34aSJunchao Zhang 44076ba34aSJunchao Zhang aijseq = static_cast<Mat_SeqAIJ *>(A->data); 45076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 46076ba34aSJunchao Zhang 47076ba34aSJunchao Zhang /* If aijkok does not exist, we just copy i, j to device. 48076ba34aSJunchao Zhang If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 49076ba34aSJunchao Zhang In both cases, we build a new aijkok structure. 50076ba34aSJunchao Zhang */ 51076ba34aSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 52076ba34aSJunchao Zhang delete aijkok; 53076ba34aSJunchao Zhang aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 54076ba34aSJunchao Zhang A->spptr = aijkok; 55076ba34aSJunchao Zhang } 56076ba34aSJunchao Zhang 573ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 588c3ff71bSJunchao Zhang } 598c3ff71bSJunchao Zhang 6086a27549SJunchao Zhang /* Sync CSR data to device if not yet */ 61d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 62d71ae5a4SJacob Faibussowitsch { 638c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 648c3ff71bSJunchao Zhang 658c3ff71bSJunchao Zhang PetscFunctionBegin; 66aaa8cc7dSPierre Jolivet PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 675f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 68076ba34aSJunchao Zhang if (aijkok->a_dual.need_sync_device()) { 69076ba34aSJunchao Zhang aijkok->a_dual.sync_device(); 70580c7c76SPierre Jolivet aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 7186a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 728c3ff71bSJunchao Zhang } 733ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 748c3ff71bSJunchao Zhang } 758c3ff71bSJunchao Zhang 76076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */ 77d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 78d71ae5a4SJacob Faibussowitsch { 7986a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 8086a27549SJunchao Zhang 8186a27549SJunchao Zhang PetscFunctionBegin; 825f80ce2aSJacob Faibussowitsch PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 8386a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 8486a27549SJunchao Zhang aijkok->a_dual.modify_device(); 8586a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 8686a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 879566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 889566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 893ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 9086a27549SJunchao Zhang } 9186a27549SJunchao Zhang 92d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 93d71ae5a4SJacob Faibussowitsch { 94f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 95f0cf5187SStefano Zampini 96f0cf5187SStefano Zampini PetscFunctionBegin; 97f0cf5187SStefano Zampini PetscCheckTypeName(A, MATSEQAIJKOKKOS); 9886a27549SJunchao Zhang /* We do not expect one needs factors on host */ 99aaa8cc7dSPierre Jolivet PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 1005f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 101076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 1023ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 103f0cf5187SStefano Zampini } 104f0cf5187SStefano Zampini 105d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 106d71ae5a4SJacob Faibussowitsch { 107076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 108f0cf5187SStefano Zampini 109f0cf5187SStefano Zampini PetscFunctionBegin; 1105519a089SJose E. Roman /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 1115519a089SJose E. Roman Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 1125519a089SJose E. Roman reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 1135519a089SJose E. Roman must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 1145519a089SJose E. Roman */ 1155519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 116076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 117076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 118076ba34aSJunchao Zhang } else { /* Happens when calling MatSetValues on a newly created matrix */ 119076ba34aSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 120076ba34aSJunchao Zhang } 1213ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 122076ba34aSJunchao Zhang } 123076ba34aSJunchao Zhang 124d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 125d71ae5a4SJacob Faibussowitsch { 126076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 127076ba34aSJunchao Zhang 128076ba34aSJunchao Zhang PetscFunctionBegin; 1295519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 1303ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 131076ba34aSJunchao Zhang } 132076ba34aSJunchao Zhang 133d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 134d71ae5a4SJacob Faibussowitsch { 135076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 136076ba34aSJunchao Zhang 137076ba34aSJunchao Zhang PetscFunctionBegin; 1385519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 139076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 140076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1412328674fSJunchao Zhang } else { 1422328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 1432328674fSJunchao Zhang } 1443ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 145076ba34aSJunchao Zhang } 146076ba34aSJunchao Zhang 147d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 148d71ae5a4SJacob Faibussowitsch { 149076ba34aSJunchao Zhang PetscFunctionBegin; 150076ba34aSJunchao Zhang *array = NULL; 1513ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 152076ba34aSJunchao Zhang } 153076ba34aSJunchao Zhang 154d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 155d71ae5a4SJacob Faibussowitsch { 156076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 157076ba34aSJunchao Zhang 158076ba34aSJunchao Zhang PetscFunctionBegin; 1595519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 160076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1612328674fSJunchao Zhang } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 1622328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 1632328674fSJunchao Zhang } 1643ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 165076ba34aSJunchao Zhang } 166076ba34aSJunchao Zhang 167d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 168d71ae5a4SJacob Faibussowitsch { 169076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 170076ba34aSJunchao Zhang 171076ba34aSJunchao Zhang PetscFunctionBegin; 1725519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 173076ba34aSJunchao Zhang aijkok->a_dual.clear_sync_state(); 174076ba34aSJunchao Zhang aijkok->a_dual.modify_host(); 1752328674fSJunchao Zhang } 1763ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 177f0cf5187SStefano Zampini } 178f0cf5187SStefano Zampini 179d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 180d71ae5a4SJacob Faibussowitsch { 1817ee59b9bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1827ee59b9bSJunchao Zhang 1837ee59b9bSJunchao Zhang PetscFunctionBegin; 1847ee59b9bSJunchao Zhang PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 1857ee59b9bSJunchao Zhang 1867ee59b9bSJunchao Zhang if (i) *i = aijkok->i_device_data(); 1877ee59b9bSJunchao Zhang if (j) *j = aijkok->j_device_data(); 1887ee59b9bSJunchao Zhang if (a) { 1897ee59b9bSJunchao Zhang aijkok->a_dual.sync_device(); 1907ee59b9bSJunchao Zhang *a = aijkok->a_device_data(); 1917ee59b9bSJunchao Zhang } 1927ee59b9bSJunchao Zhang if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 1933ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1947ee59b9bSJunchao Zhang } 1957ee59b9bSJunchao Zhang 1960e3ece09SJunchao Zhang /* 1970e3ece09SJunchao Zhang Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 1980e3ece09SJunchao Zhang 1990e3ece09SJunchao Zhang Input Parameter: 2000e3ece09SJunchao Zhang . A - the MATSEQAIJKOKKOS matrix 2010e3ece09SJunchao Zhang 2020e3ece09SJunchao Zhang Output Parameters: 2030e3ece09SJunchao Zhang + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 204aaa8cc7dSPierre Jolivet - T_d - the transpose on device, whose value array is allocated but not initialized 2050e3ece09SJunchao Zhang */ 2060e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 207d71ae5a4SJacob Faibussowitsch { 2080e3ece09SJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 2090e3ece09SJunchao Zhang PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 2100e3ece09SJunchao Zhang const PetscInt *Ai = aseq->i, *Aj = aseq->j; 2117b8d4ba6SJunchao Zhang MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 2120e3ece09SJunchao Zhang MatRowMapType *Ti = Ti_h.data(); 2137b8d4ba6SJunchao Zhang MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 2147b8d4ba6SJunchao Zhang MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 2150e3ece09SJunchao Zhang PetscInt *Tj = Tj_h.data(); 2160e3ece09SJunchao Zhang PetscInt *perm = perm_h.data(); 2170e3ece09SJunchao Zhang PetscInt *offset; 218152b3e56SJunchao Zhang 219152b3e56SJunchao Zhang PetscFunctionBegin; 2200e3ece09SJunchao Zhang // Populate Ti 2210e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 2220e3ece09SJunchao Zhang Ti++; 2230e3ece09SJunchao Zhang for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 2240e3ece09SJunchao Zhang Ti--; 2250e3ece09SJunchao Zhang for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 2260e3ece09SJunchao Zhang 2270e3ece09SJunchao Zhang // Populate Tj and the permutation array 2280e3ece09SJunchao Zhang PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 2290e3ece09SJunchao Zhang for (PetscInt i = 0; i < m; i++) { 2300e3ece09SJunchao Zhang for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 2310e3ece09SJunchao Zhang PetscInt r = Aj[j]; // row r of T 2320e3ece09SJunchao Zhang PetscInt disp = Ti[r] + offset[r]; 2330e3ece09SJunchao Zhang 2340e3ece09SJunchao Zhang Tj[disp] = i; // col i of T 2350e3ece09SJunchao Zhang perm[disp] = j; 2360e3ece09SJunchao Zhang offset[r]++; 237076ba34aSJunchao Zhang } 2380e3ece09SJunchao Zhang } 2390e3ece09SJunchao Zhang PetscCall(PetscFree(offset)); 2400e3ece09SJunchao Zhang 2410e3ece09SJunchao Zhang // Sort each row of T, along with the permutation array 2420e3ece09SJunchao Zhang for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 2430e3ece09SJunchao Zhang 2440e3ece09SJunchao Zhang // Output perm and T on device 2450e3ece09SJunchao Zhang auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 2460e3ece09SJunchao Zhang auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 2470e3ece09SJunchao Zhang PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 2480e3ece09SJunchao Zhang PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 2493ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 250152b3e56SJunchao Zhang } 251152b3e56SJunchao Zhang 2520e3ece09SJunchao Zhang // Generate the transpose on device and cache it internally 2530e3ece09SJunchao Zhang // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 2540e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 255d71ae5a4SJacob Faibussowitsch { 2560e3ece09SJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 2570e3ece09SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 2580e3ece09SJunchao Zhang PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 2590e3ece09SJunchao Zhang KokkosCsrMatrix &T = akok->csrmatT; 260152b3e56SJunchao Zhang 261152b3e56SJunchao Zhang PetscFunctionBegin; 2620e3ece09SJunchao Zhang PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 2630e3ece09SJunchao Zhang PetscCallCXX(akok->a_dual.sync_device()); // Sync A's valeus since we are going to access them on device 2640e3ece09SJunchao Zhang 2650e3ece09SJunchao Zhang const auto &Aa = akok->a_dual.view_device(); 2660e3ece09SJunchao Zhang 2670e3ece09SJunchao Zhang if (A->symmetric == PETSC_BOOL3_TRUE) { 2680e3ece09SJunchao Zhang *csrmatT = akok->csrmat; 2690e3ece09SJunchao Zhang } else { 2700e3ece09SJunchao Zhang // See if we already have a cached transpose and its value is up to date 2710e3ece09SJunchao Zhang if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 2720e3ece09SJunchao Zhang if (!akok->transpose_updated) { // if the value is out of date, update the cached version 2730e3ece09SJunchao Zhang const auto &perm = akok->transpose_perm; // get the permutation array 2740e3ece09SJunchao Zhang auto &Ta = T.values; 2750e3ece09SJunchao Zhang 2760e3ece09SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( 2770e3ece09SJunchao Zhang nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 278076ba34aSJunchao Zhang } 2790e3ece09SJunchao Zhang } else { // Generate T of size n x m for the first time 2800e3ece09SJunchao Zhang MatRowMapKokkosView perm; 2810e3ece09SJunchao Zhang 2820e3ece09SJunchao Zhang PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 2830e3ece09SJunchao Zhang akok->transpose_perm = perm; // cache the perm in this matrix for reuse 2840e3ece09SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( 2850e3ece09SJunchao Zhang nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 2860e3ece09SJunchao Zhang } 2870e3ece09SJunchao Zhang akok->transpose_updated = PETSC_TRUE; 2880e3ece09SJunchao Zhang *csrmatT = akok->csrmatT; 2890e3ece09SJunchao Zhang } 2900e3ece09SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 2910e3ece09SJunchao Zhang } 2920e3ece09SJunchao Zhang 2930e3ece09SJunchao Zhang // Generate the Hermitian on device and cache it internally 2940e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 2950e3ece09SJunchao Zhang { 2960e3ece09SJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 2970e3ece09SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 2980e3ece09SJunchao Zhang PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 2990e3ece09SJunchao Zhang KokkosCsrMatrix &T = akok->csrmatH; 3000e3ece09SJunchao Zhang 3010e3ece09SJunchao Zhang PetscFunctionBegin; 3020e3ece09SJunchao Zhang PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 3030e3ece09SJunchao Zhang PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 3040e3ece09SJunchao Zhang 3050e3ece09SJunchao Zhang const auto &Aa = akok->a_dual.view_device(); 3060e3ece09SJunchao Zhang 3070e3ece09SJunchao Zhang if (A->hermitian == PETSC_BOOL3_TRUE) { 3080e3ece09SJunchao Zhang *csrmatH = akok->csrmat; 3090e3ece09SJunchao Zhang } else { 3100e3ece09SJunchao Zhang // See if we already have a cached hermitian and its value is up to date 3110e3ece09SJunchao Zhang if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 3120e3ece09SJunchao Zhang if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 3130e3ece09SJunchao Zhang const auto &perm = akok->transpose_perm; // get the permutation array 3140e3ece09SJunchao Zhang auto &Ta = T.values; 3150e3ece09SJunchao Zhang 3160e3ece09SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( 3170e3ece09SJunchao Zhang nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 3180e3ece09SJunchao Zhang } 3190e3ece09SJunchao Zhang } else { // Generate T of size n x m for the first time 3200e3ece09SJunchao Zhang MatRowMapKokkosView perm; 3210e3ece09SJunchao Zhang 3220e3ece09SJunchao Zhang PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 3230e3ece09SJunchao Zhang akok->transpose_perm = perm; // cache the perm in this matrix for reuse 3240e3ece09SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( 3250e3ece09SJunchao Zhang nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 3260e3ece09SJunchao Zhang } 3270e3ece09SJunchao Zhang akok->hermitian_updated = PETSC_TRUE; 3280e3ece09SJunchao Zhang *csrmatH = akok->csrmatH; 3290e3ece09SJunchao Zhang } 3303ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 331152b3e56SJunchao Zhang } 332a587d139SMark 3338c3ff71bSJunchao Zhang /* y = A x */ 334d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 335d71ae5a4SJacob Faibussowitsch { 3368c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 337152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 338152b3e56SJunchao Zhang PetscScalarKokkosView yv; 3398c3ff71bSJunchao Zhang 3408c3ff71bSJunchao Zhang PetscFunctionBegin; 3419566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3429566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3439566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 3449566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 3458c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 3469d52486cSJunchao Zhang PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 3479566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 3489566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 349076ba34aSJunchao Zhang /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 3509566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 3519566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3523ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3538c3ff71bSJunchao Zhang } 3548c3ff71bSJunchao Zhang 3558c3ff71bSJunchao Zhang /* y = A^T x */ 356d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 357d71ae5a4SJacob Faibussowitsch { 3588c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 359152b3e56SJunchao Zhang const char *mode; 360152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 361152b3e56SJunchao Zhang PetscScalarKokkosView yv; 3620e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 3638c3ff71bSJunchao Zhang 3648c3ff71bSJunchao Zhang PetscFunctionBegin; 3659566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3669566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3679566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 3689566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 369152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 3709566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 371152b3e56SJunchao Zhang mode = "N"; 372152b3e56SJunchao Zhang } else { 373076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 3740e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 375152b3e56SJunchao Zhang mode = "T"; 376152b3e56SJunchao Zhang } 3770e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 3789566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 3799566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 3800e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 3819566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3823ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3838c3ff71bSJunchao Zhang } 3848c3ff71bSJunchao Zhang 3858c3ff71bSJunchao Zhang /* y = A^H x */ 386d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 387d71ae5a4SJacob Faibussowitsch { 3888c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 389152b3e56SJunchao Zhang const char *mode; 390152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 391152b3e56SJunchao Zhang PetscScalarKokkosView yv; 3920e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 3938c3ff71bSJunchao Zhang 3948c3ff71bSJunchao Zhang PetscFunctionBegin; 3959566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3969566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3979566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 3989566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 399152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 4009566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 401152b3e56SJunchao Zhang mode = "N"; 402152b3e56SJunchao Zhang } else { 403076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 4040e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 405152b3e56SJunchao Zhang mode = "C"; 406152b3e56SJunchao Zhang } 4070e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 4089566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 4099566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 4100e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 4119566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4123ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4138c3ff71bSJunchao Zhang } 4148c3ff71bSJunchao Zhang 4158c3ff71bSJunchao Zhang /* z = A x + y */ 416d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 417d71ae5a4SJacob Faibussowitsch { 4188c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 419152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv, yv; 420152b3e56SJunchao Zhang PetscScalarKokkosView zv; 4218c3ff71bSJunchao Zhang 4228c3ff71bSJunchao Zhang PetscFunctionBegin; 4239566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4249566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 4259566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 4269566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(yy, &yv)); 4279566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(zz, &zv)); 4288c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv, yv); 4298c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 4309d52486cSJunchao Zhang PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 4319566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 4329566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(yy, &yv)); 4339566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 4349566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 4359566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4378c3ff71bSJunchao Zhang } 4388c3ff71bSJunchao Zhang 4398c3ff71bSJunchao Zhang /* z = A^T x + y */ 440d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 441d71ae5a4SJacob Faibussowitsch { 4428c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 443152b3e56SJunchao Zhang const char *mode; 444152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv, yv; 445152b3e56SJunchao Zhang PetscScalarKokkosView zv; 4460e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 4478c3ff71bSJunchao Zhang 4488c3ff71bSJunchao Zhang PetscFunctionBegin; 4499566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4509566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 4519566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 4529566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(yy, &yv)); 4539566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(zz, &zv)); 4548c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv, yv); 455152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 4569566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 457152b3e56SJunchao Zhang mode = "N"; 458152b3e56SJunchao Zhang } else { 459076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 4600e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 461152b3e56SJunchao Zhang mode = "T"; 462152b3e56SJunchao Zhang } 4630e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 4649566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 4659566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(yy, &yv)); 4669566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 4670e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 4689566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4693ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4708c3ff71bSJunchao Zhang } 4718c3ff71bSJunchao Zhang 4728c3ff71bSJunchao Zhang /* z = A^H x + y */ 473d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 474d71ae5a4SJacob Faibussowitsch { 4758c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 476152b3e56SJunchao Zhang const char *mode; 477152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv, yv; 478152b3e56SJunchao Zhang PetscScalarKokkosView zv; 4790e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 4808c3ff71bSJunchao Zhang 4818c3ff71bSJunchao Zhang PetscFunctionBegin; 4829566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4839566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 4849566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 4859566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(yy, &yv)); 4869566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(zz, &zv)); 4878c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv, yv); 488152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 4899566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 490152b3e56SJunchao Zhang mode = "N"; 491152b3e56SJunchao Zhang } else { 492076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 4930e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 494152b3e56SJunchao Zhang mode = "C"; 495152b3e56SJunchao Zhang } 4960e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 4979566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 4989566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(yy, &yv)); 4999566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 5000e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 5019566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 5023ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 503152b3e56SJunchao Zhang } 504152b3e56SJunchao Zhang 505d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 506d71ae5a4SJacob Faibussowitsch { 507152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 508152b3e56SJunchao Zhang 509152b3e56SJunchao Zhang PetscFunctionBegin; 510152b3e56SJunchao Zhang switch (op) { 511152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 512152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 5139566063dSJacob Faibussowitsch if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 514152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 515152b3e56SJunchao Zhang break; 516d71ae5a4SJacob Faibussowitsch default: 517d71ae5a4SJacob Faibussowitsch PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 518d71ae5a4SJacob Faibussowitsch break; 519152b3e56SJunchao Zhang } 5203ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5218c3ff71bSJunchao Zhang } 5228c3ff71bSJunchao Zhang 523076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */ 524d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 525d71ae5a4SJacob Faibussowitsch { 526076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 5278c3ff71bSJunchao Zhang 5288c3ff71bSJunchao Zhang PetscFunctionBegin; 5299566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 530076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 5319566063dSJacob Faibussowitsch PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 5328c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 5339566063dSJacob Faibussowitsch PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 534076ba34aSJunchao Zhang } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 5355f80ce2aSJacob Faibussowitsch PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 5369566063dSJacob Faibussowitsch PetscCall(PetscFree(A->defaultvectype)); 5379566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 5389566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 5399566063dSJacob Faibussowitsch PetscCall(MatSetOps_SeqAIJKokkos(A)); 540076ba34aSJunchao Zhang aseq = static_cast<Mat_SeqAIJ *>(A->data); 541394ed5ebSJunchao Zhang if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 5425f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 543076ba34aSJunchao Zhang A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE); 5448c3ff71bSJunchao Zhang } 545076ba34aSJunchao Zhang } 5463ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5478c3ff71bSJunchao Zhang } 5488c3ff71bSJunchao Zhang 549076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 550076ba34aSJunchao Zhang an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 551076ba34aSJunchao Zhang */ 552d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 553d71ae5a4SJacob Faibussowitsch { 554076ba34aSJunchao Zhang Mat_SeqAIJ *bseq; 555076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 556076ba34aSJunchao Zhang Mat mat; 5578c3ff71bSJunchao Zhang 5588c3ff71bSJunchao Zhang PetscFunctionBegin; 559076ba34aSJunchao Zhang /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 5609566063dSJacob Faibussowitsch PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 561076ba34aSJunchao Zhang mat = *B; 562*2c4ab24aSJunchao Zhang if (A->preallocated) { 563076ba34aSJunchao Zhang bseq = static_cast<Mat_SeqAIJ *>(mat->data); 564076ba34aSJunchao Zhang bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE); 565076ba34aSJunchao Zhang bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 566076ba34aSJunchao Zhang /* Now copy values to B if needed */ 567076ba34aSJunchao Zhang if (dupOption == MAT_COPY_VALUES) { 568076ba34aSJunchao Zhang if (akok->a_dual.need_sync_device()) { 569076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 570076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 571076ba34aSJunchao Zhang } else { /* If device has the latest data, we only copy data on device */ 572076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 573076ba34aSJunchao Zhang bkok->a_dual.modify_device(); 574076ba34aSJunchao Zhang } 575076ba34aSJunchao Zhang } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 576076ba34aSJunchao Zhang /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 577076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 578076ba34aSJunchao Zhang } 579076ba34aSJunchao Zhang mat->spptr = bkok; 580076ba34aSJunchao Zhang } 581076ba34aSJunchao Zhang 5829566063dSJacob Faibussowitsch PetscCall(PetscFree(mat->defaultvectype)); 5839566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 5849566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 5859566063dSJacob Faibussowitsch PetscCall(MatSetOps_SeqAIJKokkos(mat)); 5863ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5878c3ff71bSJunchao Zhang } 5888c3ff71bSJunchao Zhang 589d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 590d71ae5a4SJacob Faibussowitsch { 5910ecb592aSJunchao Zhang Mat At; 5920e3ece09SJunchao Zhang KokkosCsrMatrix internT; 5930ecb592aSJunchao Zhang Mat_SeqAIJKokkos *atkok, *bkok; 5940ecb592aSJunchao Zhang 5950ecb592aSJunchao Zhang PetscFunctionBegin; 5967fb60732SBarry Smith if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 5979566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 5980ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 599ff751488SJunchao Zhang /* Deep copy internT, as we want to isolate the internal transpose */ 6000e3ece09SJunchao Zhang PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 6019566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 6020ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) *B = At; 6039566063dSJacob Faibussowitsch else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 6040ecb592aSJunchao Zhang } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 6050ecb592aSJunchao Zhang if ((*B)->assembled) { 6060ecb592aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 6070e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 6089566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 6090ecb592aSJunchao Zhang } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 6100ecb592aSJunchao Zhang Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 6110e3ece09SJunchao Zhang MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 6120e3ece09SJunchao Zhang MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 6130e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 6140e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 6150ecb592aSJunchao Zhang } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 6160ecb592aSJunchao Zhang } 6173ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6180ecb592aSJunchao Zhang } 6190ecb592aSJunchao Zhang 620d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 621d71ae5a4SJacob Faibussowitsch { 62286a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 6238c3ff71bSJunchao Zhang 6248c3ff71bSJunchao Zhang PetscFunctionBegin; 62586a27549SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 62686a27549SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 6278c3ff71bSJunchao Zhang delete aijkok; 62886a27549SJunchao Zhang } else { 62986a27549SJunchao Zhang delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 63086a27549SJunchao Zhang } 631cbc6b225SStefano Zampini A->spptr = NULL; 6329566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 6339566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 6349566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 6359566063dSJacob Faibussowitsch PetscCall(MatDestroy_SeqAIJ(A)); 6363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6378c3ff71bSJunchao Zhang } 6388c3ff71bSJunchao Zhang 6393f3ba80aSJunchao Zhang /*MC 6403f3ba80aSJunchao Zhang MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 6413f3ba80aSJunchao Zhang 6423f3ba80aSJunchao Zhang A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 6433f3ba80aSJunchao Zhang 6442ef1f0ffSBarry Smith Options Database Key: 64511a5261eSBarry Smith . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 6463f3ba80aSJunchao Zhang 6473f3ba80aSJunchao Zhang Level: beginner 6483f3ba80aSJunchao Zhang 6492ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 6503f3ba80aSJunchao Zhang M*/ 651d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 652d71ae5a4SJacob Faibussowitsch { 65386a27549SJunchao Zhang PetscFunctionBegin; 6549566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 6559566063dSJacob Faibussowitsch PetscCall(MatCreate_SeqAIJ(A)); 6569566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 6573ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 65886a27549SJunchao Zhang } 65986a27549SJunchao Zhang 660076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 661d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 662d71ae5a4SJacob Faibussowitsch { 663076ba34aSJunchao Zhang Mat_SeqAIJ *a, *b; 664076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok, *bkok, *ckok; 665076ba34aSJunchao Zhang MatScalarKokkosView aa, ba, ca; 666076ba34aSJunchao Zhang MatRowMapKokkosView ai, bi, ci; 667076ba34aSJunchao Zhang MatColIdxKokkosView aj, bj, cj; 668076ba34aSJunchao Zhang PetscInt m, n, nnz, aN; 669a3f881fbSStefano Zampini 670a3f881fbSStefano Zampini PetscFunctionBegin; 671076ba34aSJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 672076ba34aSJunchao Zhang PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 673076ba34aSJunchao Zhang PetscValidPointer(C, 4); 674076ba34aSJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 675076ba34aSJunchao Zhang PetscCheckTypeName(B, MATSEQAIJKOKKOS); 6765f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 6775f80ce2aSJacob Faibussowitsch PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 678076ba34aSJunchao Zhang 6799566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 6809566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 681076ba34aSJunchao Zhang a = static_cast<Mat_SeqAIJ *>(A->data); 682076ba34aSJunchao Zhang b = static_cast<Mat_SeqAIJ *>(B->data); 683076ba34aSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 684076ba34aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 685076ba34aSJunchao Zhang aa = akok->a_dual.view_device(); 686076ba34aSJunchao Zhang ai = akok->i_dual.view_device(); 687076ba34aSJunchao Zhang ba = bkok->a_dual.view_device(); 688076ba34aSJunchao Zhang bi = bkok->i_dual.view_device(); 689076ba34aSJunchao Zhang m = A->rmap->n; /* M, N and nnz of C */ 690076ba34aSJunchao Zhang n = A->cmap->n + B->cmap->n; 691076ba34aSJunchao Zhang nnz = a->nz + b->nz; 692076ba34aSJunchao Zhang aN = A->cmap->n; /* N of A */ 693076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { 694076ba34aSJunchao Zhang aj = akok->j_dual.view_device(); 695076ba34aSJunchao Zhang bj = bkok->j_dual.view_device(); 696076ba34aSJunchao Zhang auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 697076ba34aSJunchao Zhang auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 698076ba34aSJunchao Zhang auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 699076ba34aSJunchao Zhang ca = ca_dual.view_device(); 700076ba34aSJunchao Zhang ci = ci_dual.view_device(); 701076ba34aSJunchao Zhang cj = cj_dual.view_device(); 702076ba34aSJunchao Zhang 703076ba34aSJunchao Zhang /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 7049371c9d4SSatish Balay Kokkos::parallel_for( 7059371c9d4SSatish Balay Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 706076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 707076ba34aSJunchao Zhang PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 708076ba34aSJunchao Zhang 709076ba34aSJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 710076ba34aSJunchao Zhang ci(i) = coffset; 711076ba34aSJunchao Zhang if (i == m - 1) ci(m) = ai(m) + bi(m); 712076ba34aSJunchao Zhang }); 713076ba34aSJunchao Zhang 714076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 715076ba34aSJunchao Zhang if (k < alen) { 716076ba34aSJunchao Zhang ca(coffset + k) = aa(ai(i) + k); 717076ba34aSJunchao Zhang cj(coffset + k) = aj(ai(i) + k); 718076ba34aSJunchao Zhang } else { 719076ba34aSJunchao Zhang ca(coffset + k) = ba(bi(i) + k - alen); 720076ba34aSJunchao Zhang cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 721076ba34aSJunchao Zhang } 722076ba34aSJunchao Zhang }); 723076ba34aSJunchao Zhang }); 724076ba34aSJunchao Zhang ca_dual.modify_device(); 725076ba34aSJunchao Zhang ci_dual.modify_device(); 726076ba34aSJunchao Zhang cj_dual.modify_device(); 7279566063dSJacob Faibussowitsch PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 7289566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 729076ba34aSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { 730076ba34aSJunchao Zhang PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 731076ba34aSJunchao Zhang PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 732076ba34aSJunchao Zhang ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 733076ba34aSJunchao Zhang ca = ckok->a_dual.view_device(); 734076ba34aSJunchao Zhang ci = ckok->i_dual.view_device(); 735076ba34aSJunchao Zhang 7369371c9d4SSatish Balay Kokkos::parallel_for( 7379371c9d4SSatish Balay Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 738076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 739076ba34aSJunchao Zhang PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 740076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 741076ba34aSJunchao Zhang if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 742076ba34aSJunchao Zhang else ca(ci(i) + k) = ba(bi(i) + k - alen); 743076ba34aSJunchao Zhang }); 744076ba34aSJunchao Zhang }); 7459566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 746076ba34aSJunchao Zhang } 7473ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 748076ba34aSJunchao Zhang } 749076ba34aSJunchao Zhang 750d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 751d71ae5a4SJacob Faibussowitsch { 752076ba34aSJunchao Zhang PetscFunctionBegin; 753076ba34aSJunchao Zhang delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 7543ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 755a3f881fbSStefano Zampini } 756a3f881fbSStefano Zampini 757d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 758d71ae5a4SJacob Faibussowitsch { 759a3f881fbSStefano Zampini Mat_Product *product = C->product; 760a3f881fbSStefano Zampini Mat A, B; 761076ba34aSJunchao Zhang bool transA, transB; /* use bool, since KK needs this type */ 762a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok, *bkok, *ckok; 763a3f881fbSStefano Zampini Mat_SeqAIJ *c; 764076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 7650e3ece09SJunchao Zhang KokkosCsrMatrix csrmatA, csrmatB; 766a3f881fbSStefano Zampini 767a3f881fbSStefano Zampini PetscFunctionBegin; 768a3f881fbSStefano Zampini MatCheckProduct(C, 1); 7695f80ce2aSJacob Faibussowitsch PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 770076ba34aSJunchao Zhang pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 771076ba34aSJunchao Zhang 7720e3ece09SJunchao Zhang // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 7730e3ece09SJunchao Zhang // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 7740e3ece09SJunchao Zhang // we still do numeric. 7750e3ece09SJunchao Zhang if (pdata->reusesym) { // numeric reuses results from symbolic 7760e3ece09SJunchao Zhang pdata->reusesym = PETSC_FALSE; 7773ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 778076ba34aSJunchao Zhang } 779076ba34aSJunchao Zhang 780076ba34aSJunchao Zhang switch (product->type) { 7819371c9d4SSatish Balay case MATPRODUCT_AB: 7829371c9d4SSatish Balay transA = false; 7839371c9d4SSatish Balay transB = false; 7849371c9d4SSatish Balay break; 7859371c9d4SSatish Balay case MATPRODUCT_AtB: 7869371c9d4SSatish Balay transA = true; 7879371c9d4SSatish Balay transB = false; 7889371c9d4SSatish Balay break; 7899371c9d4SSatish Balay case MATPRODUCT_ABt: 7909371c9d4SSatish Balay transA = false; 7919371c9d4SSatish Balay transB = true; 7929371c9d4SSatish Balay break; 793d71ae5a4SJacob Faibussowitsch default: 794d71ae5a4SJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 795076ba34aSJunchao Zhang } 796076ba34aSJunchao Zhang 797a3f881fbSStefano Zampini A = product->A; 798a3f881fbSStefano Zampini B = product->B; 7999566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 8009566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 801a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 802a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 803a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 804076ba34aSJunchao Zhang 8055f80ce2aSJacob Faibussowitsch PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 806076ba34aSJunchao Zhang 8070e3ece09SJunchao Zhang csrmatA = akok->csrmat; 8080e3ece09SJunchao Zhang csrmatB = bkok->csrmat; 809076ba34aSJunchao Zhang 810076ba34aSJunchao Zhang /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 811076ba34aSJunchao Zhang if (transA) { 8129566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 813076ba34aSJunchao Zhang transA = false; 814a3f881fbSStefano Zampini } 815a3f881fbSStefano Zampini 816076ba34aSJunchao Zhang if (transB) { 8179566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 818076ba34aSJunchao Zhang transB = false; 819076ba34aSJunchao Zhang } 8209566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 8210e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 8220e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 823866eb059SJunchao Zhang auto spgemmHandle = pdata->kh.get_spgemm_handle(); 824866eb059SJunchao Zhang if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 825e944a159SJunchao Zhang #endif 826866eb059SJunchao Zhang 8279566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 8289566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(C)); 829a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 830a3f881fbSStefano Zampini c = (Mat_SeqAIJ *)C->data; 8319566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 8329566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 8339566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 834a3f881fbSStefano Zampini c->reallocs = 0; 835076ba34aSJunchao Zhang C->info.mallocs = 0; 836a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 837a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 838a3f881fbSStefano Zampini C->num_ass++; 8393ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 840a3f881fbSStefano Zampini } 841a3f881fbSStefano Zampini 842d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 843d71ae5a4SJacob Faibussowitsch { 844076ba34aSJunchao Zhang Mat_Product *product = C->product; 845076ba34aSJunchao Zhang MatProductType ptype; 846076ba34aSJunchao Zhang Mat A, B; 847076ba34aSJunchao Zhang bool transA, transB; 848076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok, *bkok, *ckok; 849076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 850076ba34aSJunchao Zhang MPI_Comm comm; 8510e3ece09SJunchao Zhang KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 852a3f881fbSStefano Zampini 853a3f881fbSStefano Zampini PetscFunctionBegin; 854a3f881fbSStefano Zampini MatCheckProduct(C, 1); 8559566063dSJacob Faibussowitsch PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 8565f80ce2aSJacob Faibussowitsch PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 857a3f881fbSStefano Zampini A = product->A; 858a3f881fbSStefano Zampini B = product->B; 8599566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 8609566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 861a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 862a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 8630e3ece09SJunchao Zhang csrmatA = akok->csrmat; 8640e3ece09SJunchao Zhang csrmatB = bkok->csrmat; 865076ba34aSJunchao Zhang 866a3f881fbSStefano Zampini ptype = product->type; 8670e3ece09SJunchao Zhang // Take advantage of the symmetry if true 8680e3ece09SJunchao Zhang if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 8690e3ece09SJunchao Zhang ptype = MATPRODUCT_AB; 8700e3ece09SJunchao Zhang product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 8710e3ece09SJunchao Zhang } 8720e3ece09SJunchao Zhang if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 8730e3ece09SJunchao Zhang ptype = MATPRODUCT_AB; 8740e3ece09SJunchao Zhang product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 8750e3ece09SJunchao Zhang } 8760e3ece09SJunchao Zhang 877a3f881fbSStefano Zampini switch (ptype) { 8789371c9d4SSatish Balay case MATPRODUCT_AB: 8799371c9d4SSatish Balay transA = false; 8809371c9d4SSatish Balay transB = false; 8819371c9d4SSatish Balay break; 8829371c9d4SSatish Balay case MATPRODUCT_AtB: 8839371c9d4SSatish Balay transA = true; 8849371c9d4SSatish Balay transB = false; 8859371c9d4SSatish Balay break; 8869371c9d4SSatish Balay case MATPRODUCT_ABt: 8879371c9d4SSatish Balay transA = false; 8889371c9d4SSatish Balay transB = true; 8899371c9d4SSatish Balay break; 890d71ae5a4SJacob Faibussowitsch default: 891d71ae5a4SJacob Faibussowitsch SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 892a3f881fbSStefano Zampini } 8930e3ece09SJunchao Zhang PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 894076ba34aSJunchao Zhang pdata->reusesym = product->api_user; 895a3f881fbSStefano Zampini 896076ba34aSJunchao Zhang /* TODO: add command line options to select spgemm algorithms */ 897866eb059SJunchao Zhang auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 898866eb059SJunchao Zhang 899866eb059SJunchao Zhang /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 900866eb059SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 901866eb059SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 902866eb059SJunchao Zhang spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 903866eb059SJunchao Zhang #endif 904866eb059SJunchao Zhang #endif 9050e3ece09SJunchao Zhang PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 906076ba34aSJunchao Zhang 9079566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 908076ba34aSJunchao Zhang /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 909076ba34aSJunchao Zhang if (transA) { 9109566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 911076ba34aSJunchao Zhang transA = false; 912076ba34aSJunchao Zhang } 913076ba34aSJunchao Zhang 914076ba34aSJunchao Zhang if (transB) { 9159566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 916076ba34aSJunchao Zhang transB = false; 917076ba34aSJunchao Zhang } 918076ba34aSJunchao Zhang 9190e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 920076ba34aSJunchao Zhang /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 921076ba34aSJunchao Zhang So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 922076ba34aSJunchao Zhang calling new Mat_SeqAIJKokkos(). 923076ba34aSJunchao Zhang TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 924076ba34aSJunchao Zhang */ 9250e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 9260e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 927866eb059SJunchao Zhang /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 928866eb059SJunchao Zhang auto spgemmHandle = pdata->kh.get_spgemm_handle(); 929866eb059SJunchao Zhang if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 930e944a159SJunchao Zhang #endif 9319566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 932076ba34aSJunchao Zhang 9339566063dSJacob Faibussowitsch PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 9349566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 935076ba34aSJunchao Zhang C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 9363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 937a3f881fbSStefano Zampini } 938a3f881fbSStefano Zampini 939a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 940d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 941d71ae5a4SJacob Faibussowitsch { 942076ba34aSJunchao Zhang Mat_Product *product = mat->product; 943a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 944a3f881fbSStefano Zampini 945a3f881fbSStefano Zampini PetscFunctionBegin; 946a3f881fbSStefano Zampini MatCheckProduct(mat, 1); 9479566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 94848a46eb9SPierre Jolivet if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 949a3f881fbSStefano Zampini if (Biskok && Ciskok) { 950a3f881fbSStefano Zampini switch (product->type) { 951a3f881fbSStefano Zampini case MATPRODUCT_AB: 952a3f881fbSStefano Zampini case MATPRODUCT_AtB: 953d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABt: 954d71ae5a4SJacob Faibussowitsch mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 955d71ae5a4SJacob Faibussowitsch break; 956a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 957a3f881fbSStefano Zampini case MATPRODUCT_RARt: 958d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABC: 959d71ae5a4SJacob Faibussowitsch mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 960d71ae5a4SJacob Faibussowitsch break; 961d71ae5a4SJacob Faibussowitsch default: 962d71ae5a4SJacob Faibussowitsch break; 963a3f881fbSStefano Zampini } 964a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 9659566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 966a3f881fbSStefano Zampini } 9673ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 968a3f881fbSStefano Zampini } 969a587d139SMark 970d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 971d71ae5a4SJacob Faibussowitsch { 972f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 973f0cf5187SStefano Zampini 974f0cf5187SStefano Zampini PetscFunctionBegin; 9759566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 9769566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 977f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 978076ba34aSJunchao Zhang KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 9799566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 9809566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 9819566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 9823ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 983f0cf5187SStefano Zampini } 984f0cf5187SStefano Zampini 985d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 986d71ae5a4SJacob Faibussowitsch { 987076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 988a587d139SMark 989a587d139SMark PetscFunctionBegin; 990076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 9912328674fSJunchao Zhang if (aijkok) { /* Only zero the device if data is already there */ 992076ba34aSJunchao Zhang KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0); 9939566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 9942328674fSJunchao Zhang } else { /* Might be preallocated but not assembled */ 9959566063dSJacob Faibussowitsch PetscCall(MatZeroEntries_SeqAIJ(A)); 9962328674fSJunchao Zhang } 9973ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 998a587d139SMark } 999a587d139SMark 1000d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1001d71ae5a4SJacob Faibussowitsch { 1002f78ce678SMark Adams Mat_SeqAIJ *aijseq; 1003f78ce678SMark Adams Mat_SeqAIJKokkos *aijkok; 1004f78ce678SMark Adams PetscInt n; 1005f78ce678SMark Adams PetscScalarKokkosView xv; 1006f78ce678SMark Adams 1007f78ce678SMark Adams PetscFunctionBegin; 1008f78ce678SMark Adams PetscCall(VecGetLocalSize(x, &n)); 1009f78ce678SMark Adams PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1010f78ce678SMark Adams PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1011f78ce678SMark Adams 1012f78ce678SMark Adams PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1013f78ce678SMark Adams aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1014f78ce678SMark Adams 1015f78ce678SMark Adams if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */ 1016f78ce678SMark Adams PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1017f78ce678SMark Adams aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1018f78ce678SMark Adams aijkok->SetDiagonal(aijseq->diag); 1019f78ce678SMark Adams } 1020f78ce678SMark Adams 1021f78ce678SMark Adams const auto &Aa = aijkok->a_dual.view_device(); 1022f78ce678SMark Adams const auto &Ai = aijkok->i_dual.view_device(); 1023f78ce678SMark Adams const auto &Adiag = aijkok->diag_dual.view_device(); 1024f78ce678SMark Adams 1025f78ce678SMark Adams PetscCall(VecGetKokkosViewWrite(x, &xv)); 10269371c9d4SSatish Balay Kokkos::parallel_for( 10279371c9d4SSatish Balay n, KOKKOS_LAMBDA(const PetscInt i) { 1028f78ce678SMark Adams if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1029f78ce678SMark Adams else xv(i) = 0; 1030f78ce678SMark Adams }); 1031f78ce678SMark Adams PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 10323ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1033f78ce678SMark Adams } 1034f78ce678SMark Adams 1035db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1036d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1037d71ae5a4SJacob Faibussowitsch { 1038db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1039db78de30SJunchao Zhang 1040db78de30SJunchao Zhang PetscFunctionBegin; 1041db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1042db78de30SJunchao Zhang PetscValidPointer(kv, 2); 1043db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 10449566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1045db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1046076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 10473ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1048db78de30SJunchao Zhang } 1049db78de30SJunchao Zhang 1050d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1051d71ae5a4SJacob Faibussowitsch { 1052db78de30SJunchao Zhang PetscFunctionBegin; 1053db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1054db78de30SJunchao Zhang PetscValidPointer(kv, 2); 1055db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 10563ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1057db78de30SJunchao Zhang } 1058db78de30SJunchao Zhang 1059d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1060d71ae5a4SJacob Faibussowitsch { 1061db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1062db78de30SJunchao Zhang 1063db78de30SJunchao Zhang PetscFunctionBegin; 1064db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1065db78de30SJunchao Zhang PetscValidPointer(kv, 2); 1066db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 10679566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1068db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1069076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 10703ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1071db78de30SJunchao Zhang } 1072db78de30SJunchao Zhang 1073d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1074d71ae5a4SJacob Faibussowitsch { 1075db78de30SJunchao Zhang PetscFunctionBegin; 1076db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1077db78de30SJunchao Zhang PetscValidPointer(kv, 2); 1078db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 10799566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 10803ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1081db78de30SJunchao Zhang } 1082db78de30SJunchao Zhang 1083d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1084d71ae5a4SJacob Faibussowitsch { 1085db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1086db78de30SJunchao Zhang 1087db78de30SJunchao Zhang PetscFunctionBegin; 1088db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1089db78de30SJunchao Zhang PetscValidPointer(kv, 2); 1090db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1091db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1092076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 10933ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1094db78de30SJunchao Zhang } 1095db78de30SJunchao Zhang 1096d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1097d71ae5a4SJacob Faibussowitsch { 1098db78de30SJunchao Zhang PetscFunctionBegin; 1099db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1100db78de30SJunchao Zhang PetscValidPointer(kv, 2); 1101db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 11029566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 11033ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1104db78de30SJunchao Zhang } 1105db78de30SJunchao Zhang 1106c17cf699SJunchao Zhang /* Computes Y += alpha X */ 1107d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1108d71ae5a4SJacob Faibussowitsch { 1109a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1110c17cf699SJunchao Zhang Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1111c17cf699SJunchao Zhang ConstMatScalarKokkosView Xa; 1112c17cf699SJunchao Zhang MatScalarKokkosView Ya; 1113a587d139SMark 1114a587d139SMark PetscFunctionBegin; 1115c17cf699SJunchao Zhang PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1116c17cf699SJunchao Zhang PetscCheckTypeName(X, MATSEQAIJKOKKOS); 11179566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 11189566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(X)); 11199566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1120db78de30SJunchao Zhang 1121c17cf699SJunchao Zhang if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1122a587d139SMark PetscBool e; 11239566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1124a587d139SMark if (e) { 11259566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1126c17cf699SJunchao Zhang if (e) pattern = SAME_NONZERO_PATTERN; 1127a587d139SMark } 1128a587d139SMark } 1129db78de30SJunchao Zhang 1130c17cf699SJunchao Zhang /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1131c17cf699SJunchao Zhang cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1132c17cf699SJunchao Zhang If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1133c17cf699SJunchao Zhang KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1134c17cf699SJunchao Zhang */ 1135c17cf699SJunchao Zhang ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1136c17cf699SJunchao Zhang xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1137c17cf699SJunchao Zhang Xa = xkok->a_dual.view_device(); 1138c17cf699SJunchao Zhang Ya = ykok->a_dual.view_device(); 1139c17cf699SJunchao Zhang 1140c17cf699SJunchao Zhang if (pattern == SAME_NONZERO_PATTERN) { 1141c17cf699SJunchao Zhang KokkosBlas::axpy(alpha, Xa, Ya); 11429566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1143c17cf699SJunchao Zhang } else if (pattern == SUBSET_NONZERO_PATTERN) { 1144c17cf699SJunchao Zhang MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1145c17cf699SJunchao Zhang MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1146c17cf699SJunchao Zhang 11479371c9d4SSatish Balay Kokkos::parallel_for( 11489371c9d4SSatish Balay Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 11490e3ece09SJunchao Zhang PetscInt i = t.league_rank(); // row i 11500e3ece09SJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { 11510e3ece09SJunchao Zhang // Only one thread works in a team 1152c17cf699SJunchao Zhang PetscInt p, q = Yi(i); 11530e3ece09SJunchao Zhang for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 11540e3ece09SJunchao Zhang while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 11550e3ece09SJunchao Zhang if (Xj(p) == Yj(q)) { // Found it 1156c17cf699SJunchao Zhang Ya(q) += alpha * Xa(p); 1157c17cf699SJunchao Zhang q++; 1158a587d139SMark } else { 11590e3ece09SJunchao Zhang // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 11600e3ece09SJunchao Zhang // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 11610e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 11620e3ece09SJunchao Zhang if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 11638b8b16f9SJunchao Zhang #else 11640e3ece09SJunchao Zhang if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 11658b8b16f9SJunchao Zhang #endif 1166a587d139SMark } 1167c17cf699SJunchao Zhang } 1168c17cf699SJunchao Zhang }); 1169c17cf699SJunchao Zhang }); 11709566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 11710e3ece09SJunchao Zhang } else { // different nonzero patterns 1172c17cf699SJunchao Zhang Mat Z; 1173c17cf699SJunchao Zhang KokkosCsrMatrix zcsr; 1174c17cf699SJunchao Zhang KernelHandle kh; 11750e3ece09SJunchao Zhang kh.create_spadd_handle(true); // X, Y are sorted 1176c17cf699SJunchao Zhang KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1177c17cf699SJunchao Zhang KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1178c17cf699SJunchao Zhang zkok = new Mat_SeqAIJKokkos(zcsr); 11799566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 11809566063dSJacob Faibussowitsch PetscCall(MatHeaderReplace(Y, &Z)); 1181c17cf699SJunchao Zhang kh.destroy_spadd_handle(); 1182c17cf699SJunchao Zhang } 11839566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 11840e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 11853ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1186a587d139SMark } 1187a587d139SMark 1188*2c4ab24aSJunchao Zhang struct MatCOOStruct_SeqAIJKokkos { 1189*2c4ab24aSJunchao Zhang PetscCount n; 1190*2c4ab24aSJunchao Zhang PetscCount Atot; 1191*2c4ab24aSJunchao Zhang PetscInt nz; 1192*2c4ab24aSJunchao Zhang PetscCountKokkosView jmap; 1193*2c4ab24aSJunchao Zhang PetscCountKokkosView perm; 1194*2c4ab24aSJunchao Zhang 1195*2c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 1196*2c4ab24aSJunchao Zhang { 1197*2c4ab24aSJunchao Zhang nz = coo_h->nz; 1198*2c4ab24aSJunchao Zhang n = coo_h->n; 1199*2c4ab24aSJunchao Zhang Atot = coo_h->Atot; 1200*2c4ab24aSJunchao Zhang jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 1201*2c4ab24aSJunchao Zhang perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 1202*2c4ab24aSJunchao Zhang } 1203*2c4ab24aSJunchao Zhang }; 1204*2c4ab24aSJunchao Zhang 1205*2c4ab24aSJunchao Zhang static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void *data) 1206*2c4ab24aSJunchao Zhang { 1207*2c4ab24aSJunchao Zhang PetscFunctionBegin; 1208*2c4ab24aSJunchao Zhang PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(data)); 1209*2c4ab24aSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 1210*2c4ab24aSJunchao Zhang } 1211*2c4ab24aSJunchao Zhang 1212d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1213d71ae5a4SJacob Faibussowitsch { 121442550becSJunchao Zhang Mat_SeqAIJKokkos *akok; 121542550becSJunchao Zhang Mat_SeqAIJ *aseq; 1216*2c4ab24aSJunchao Zhang PetscContainer container_h, container_d; 1217*2c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJ *coo_h; 1218*2c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJKokkos *coo_d; 121942550becSJunchao Zhang 122042550becSJunchao Zhang PetscFunctionBegin; 12219566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1222394ed5ebSJunchao Zhang aseq = static_cast<Mat_SeqAIJ *>(mat->data); 122342550becSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1224cbc6b225SStefano Zampini delete akok; 1225cbc6b225SStefano Zampini mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE); 12269566063dSJacob Faibussowitsch PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1227*2c4ab24aSJunchao Zhang 1228*2c4ab24aSJunchao Zhang // Copy the COO struct to device 1229*2c4ab24aSJunchao Zhang PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 1230*2c4ab24aSJunchao Zhang PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 1231*2c4ab24aSJunchao Zhang PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 1232*2c4ab24aSJunchao Zhang 1233*2c4ab24aSJunchao Zhang // Put the COO struct in a container and then attach that to the matrix 1234*2c4ab24aSJunchao Zhang PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d)); 1235*2c4ab24aSJunchao Zhang PetscCall(PetscContainerSetPointer(container_d, coo_d)); 1236*2c4ab24aSJunchao Zhang PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_SeqAIJKokkos)); 1237*2c4ab24aSJunchao Zhang PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d)); 1238*2c4ab24aSJunchao Zhang PetscCall(PetscContainerDestroy(&container_d)); 12393ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 124042550becSJunchao Zhang } 124142550becSJunchao Zhang 1242d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1243d71ae5a4SJacob Faibussowitsch { 124442550becSJunchao Zhang MatScalarKokkosView Aa; 124542550becSJunchao Zhang ConstMatScalarKokkosView kv; 124642550becSJunchao Zhang PetscMemType memtype; 1247*2c4ab24aSJunchao Zhang PetscContainer container; 1248*2c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJKokkos *coo; 124942550becSJunchao Zhang 125042550becSJunchao Zhang PetscFunctionBegin; 1251*2c4ab24aSJunchao Zhang PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 1252*2c4ab24aSJunchao Zhang PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 1253*2c4ab24aSJunchao Zhang 1254*2c4ab24aSJunchao Zhang const auto &n = coo->n; 1255*2c4ab24aSJunchao Zhang const auto &Annz = coo->nz; 1256*2c4ab24aSJunchao Zhang const auto &jmap = coo->jmap; 1257*2c4ab24aSJunchao Zhang const auto &perm = coo->perm; 1258*2c4ab24aSJunchao Zhang 12599566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(v, &memtype)); 126042550becSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1261*2c4ab24aSJunchao Zhang kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 126242550becSJunchao Zhang } else { 1263*2c4ab24aSJunchao Zhang kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 126442550becSJunchao Zhang } 126542550becSJunchao Zhang 1266c7b718f4SJunchao Zhang if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1267c7b718f4SJunchao Zhang else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 126842550becSJunchao Zhang 12699371c9d4SSatish Balay Kokkos::parallel_for( 12709371c9d4SSatish Balay Annz, KOKKOS_LAMBDA(const PetscCount i) { 1271c7b718f4SJunchao Zhang PetscScalar sum = 0.0; 1272c7b718f4SJunchao Zhang for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1273c7b718f4SJunchao Zhang Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1274c7b718f4SJunchao Zhang }); 1275394ed5ebSJunchao Zhang 12769566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 12779566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 12783ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 127942550becSJunchao Zhang } 128042550becSJunchao Zhang 1281d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag) 1282d71ae5a4SJacob Faibussowitsch { 12835fbaff96SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 12845fbaff96SJunchao Zhang MatScalarKokkosView Aa; 12855fbaff96SJunchao Zhang const MatRowMapKokkosView &Ai = akok->i_dual.view_device(); 12865fbaff96SJunchao Zhang PetscInt m = A->rmap->n; 12875fbaff96SJunchao Zhang ConstMatRowMapKokkosView Adiag(diag, m); /* diag is a device pointer */ 12885fbaff96SJunchao Zhang 12895fbaff96SJunchao Zhang PetscFunctionBegin; 12905fbaff96SJunchao Zhang PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); 12919371c9d4SSatish Balay Kokkos::parallel_for( 12929371c9d4SSatish Balay m, KOKKOS_LAMBDA(const PetscInt i) { 12935fbaff96SJunchao Zhang PetscScalar tmp; 12945fbaff96SJunchao Zhang if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */ 12955fbaff96SJunchao Zhang tmp = Aa(Ai(i)); 12965fbaff96SJunchao Zhang Aa(Ai(i)) = Aa(Adiag(i)); 12975fbaff96SJunchao Zhang Aa(Adiag(i)) = tmp; 12985fbaff96SJunchao Zhang } 12995fbaff96SJunchao Zhang }); 13005fbaff96SJunchao Zhang PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 13013ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 13025fbaff96SJunchao Zhang } 13035fbaff96SJunchao Zhang 1304d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1305d71ae5a4SJacob Faibussowitsch { 13068f7e8f9dSMark Adams PetscFunctionBegin; 13079566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncHost(A)); 13089566063dSJacob Faibussowitsch PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 13098f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 13103ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 13118f7e8f9dSMark Adams } 13128f7e8f9dSMark Adams 1313d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1314d71ae5a4SJacob Faibussowitsch { 1315076ba34aSJunchao Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1316076ba34aSJunchao Zhang 13178c3ff71bSJunchao Zhang PetscFunctionBegin; 1318076ba34aSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 13196f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 13206f3d89d0SStefano Zampini 13218c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 13228c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 13238c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1324a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 1325f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 1326a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1327076ba34aSJunchao Zhang A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 13288c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 13298c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 13308c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 13318c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 13328c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 13338c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1334076ba34aSJunchao Zhang A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 13350ecb592aSJunchao Zhang A->ops->transpose = MatTranspose_SeqAIJKokkos; 1336152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 1337f78ce678SMark Adams A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1338076ba34aSJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1339076ba34aSJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1340076ba34aSJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1341076ba34aSJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1342076ba34aSJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1343076ba34aSJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 13447ee59b9bSJunchao Zhang a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 134542550becSJunchao Zhang 13469566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 13479566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 13483ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1349076ba34aSJunchao Zhang } 1350076ba34aSJunchao Zhang 1351d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1352d71ae5a4SJacob Faibussowitsch { 1353076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 1354076ba34aSJunchao Zhang PetscInt i, m, n; 1355076ba34aSJunchao Zhang 1356076ba34aSJunchao Zhang PetscFunctionBegin; 13575f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1358076ba34aSJunchao Zhang 1359076ba34aSJunchao Zhang m = akok->nrows(); 1360076ba34aSJunchao Zhang n = akok->ncols(); 13619566063dSJacob Faibussowitsch PetscCall(MatSetSizes(A, m, n, m, n)); 13629566063dSJacob Faibussowitsch PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1363076ba34aSJunchao Zhang 1364076ba34aSJunchao Zhang /* Set up data structures of A as a MATSEQAIJ */ 13659566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1366076ba34aSJunchao Zhang aseq = (Mat_SeqAIJ *)(A)->data; 1367076ba34aSJunchao Zhang 1368076ba34aSJunchao Zhang akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1369076ba34aSJunchao Zhang akok->j_dual.sync_host(); 1370076ba34aSJunchao Zhang 1371076ba34aSJunchao Zhang aseq->i = akok->i_host_data(); 1372076ba34aSJunchao Zhang aseq->j = akok->j_host_data(); 1373076ba34aSJunchao Zhang aseq->a = akok->a_host_data(); 1374076ba34aSJunchao Zhang aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1375076ba34aSJunchao Zhang aseq->singlemalloc = PETSC_FALSE; 1376076ba34aSJunchao Zhang aseq->free_a = PETSC_FALSE; 1377076ba34aSJunchao Zhang aseq->free_ij = PETSC_FALSE; 1378076ba34aSJunchao Zhang aseq->nz = akok->nnz(); 1379076ba34aSJunchao Zhang aseq->maxnz = aseq->nz; 1380076ba34aSJunchao Zhang 13819566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aseq->imax)); 13829566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aseq->ilen)); 1383ad540459SPierre Jolivet for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1384076ba34aSJunchao Zhang 1385076ba34aSJunchao Zhang /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1386076ba34aSJunchao Zhang akok->nonzerostate = A->nonzerostate; 1387ff751488SJunchao Zhang A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 13889566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 13899566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 13903ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1391076ba34aSJunchao Zhang } 1392076ba34aSJunchao Zhang 13930e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 13940e3ece09SJunchao Zhang { 13950e3ece09SJunchao Zhang PetscFunctionBegin; 13960e3ece09SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(A)); 13970e3ece09SJunchao Zhang *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 13980e3ece09SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 13990e3ece09SJunchao Zhang } 14000e3ece09SJunchao Zhang 14010e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 14020e3ece09SJunchao Zhang { 14030e3ece09SJunchao Zhang Mat_SeqAIJKokkos *akok; 14040e3ece09SJunchao Zhang PetscFunctionBegin; 14050e3ece09SJunchao Zhang PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 14060e3ece09SJunchao Zhang PetscCall(MatCreate(comm, A)); 14070e3ece09SJunchao Zhang PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 14080e3ece09SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 14090e3ece09SJunchao Zhang } 14100e3ece09SJunchao Zhang 1411076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1412076ba34aSJunchao Zhang 1413076ba34aSJunchao Zhang Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1414076ba34aSJunchao Zhang */ 1415d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1416d71ae5a4SJacob Faibussowitsch { 1417076ba34aSJunchao Zhang PetscFunctionBegin; 14189566063dSJacob Faibussowitsch PetscCall(MatCreate(comm, A)); 14199566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 14203ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 14218c3ff71bSJunchao Zhang } 14228c3ff71bSJunchao Zhang 1423152b3e56SJunchao Zhang /*@C 142411a5261eSBarry Smith MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 14258c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 142620f4b53cSBarry Smith Kokkos for calculations. 14278c3ff71bSJunchao Zhang 14288c3ff71bSJunchao Zhang Collective 14298c3ff71bSJunchao Zhang 14308c3ff71bSJunchao Zhang Input Parameters: 143111a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF` 14328c3ff71bSJunchao Zhang . m - number of rows 14338c3ff71bSJunchao Zhang . n - number of columns 143420f4b53cSBarry Smith . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 143520f4b53cSBarry Smith - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 14368c3ff71bSJunchao Zhang 14378c3ff71bSJunchao Zhang Output Parameter: 14388c3ff71bSJunchao Zhang . A - the matrix 14398c3ff71bSJunchao Zhang 14402ef1f0ffSBarry Smith Level: intermediate 14412ef1f0ffSBarry Smith 14422ef1f0ffSBarry Smith Notes: 144311a5261eSBarry Smith It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 14448c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 144511a5261eSBarry Smith [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 14468c3ff71bSJunchao Zhang 144711a5261eSBarry Smith The AIJ format, also called 14482ef1f0ffSBarry Smith compressed row storage, is fully compatible with standard Fortran 14498c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 145020f4b53cSBarry Smith either one (as in Fortran) or zero. 14518c3ff71bSJunchao Zhang 14522ef1f0ffSBarry Smith Specify the preallocated storage with either `nz` or `nnz` (not both). 14532ef1f0ffSBarry Smith Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 14542ef1f0ffSBarry Smith allocation. 14558c3ff71bSJunchao Zhang 14562ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()` 14578c3ff71bSJunchao Zhang @*/ 1458d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1459d71ae5a4SJacob Faibussowitsch { 14608c3ff71bSJunchao Zhang PetscFunctionBegin; 14619566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 14629566063dSJacob Faibussowitsch PetscCall(MatCreate(comm, A)); 14639566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A, m, n, m, n)); 14649566063dSJacob Faibussowitsch PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 14659566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 14663ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 14678c3ff71bSJunchao Zhang } 1468930e68a5SMark Adams 1469d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1470d71ae5a4SJacob Faibussowitsch { 1471930e68a5SMark Adams PetscFunctionBegin; 14729566063dSJacob Faibussowitsch PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 147386a27549SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 14743ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 147586a27549SJunchao Zhang } 147686a27549SJunchao Zhang 1477d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1478d71ae5a4SJacob Faibussowitsch { 147986a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 148086a27549SJunchao Zhang 148186a27549SJunchao Zhang PetscFunctionBegin; 148286a27549SJunchao Zhang if (!factors->sptrsv_symbolic_completed) { 148386a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 148486a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 148586a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_TRUE; 148686a27549SJunchao Zhang } 14873ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 148886a27549SJunchao Zhang } 148986a27549SJunchao Zhang 149086a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */ 1491d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1492d71ae5a4SJacob Faibussowitsch { 149386a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1494076ba34aSJunchao Zhang MatColIdxType n = A->rmap->n; 149586a27549SJunchao Zhang 149686a27549SJunchao Zhang PetscFunctionBegin; 149786a27549SJunchao Zhang if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 149886a27549SJunchao Zhang /* Update L^T and do sptrsv symbolic */ 14997b8d4ba6SJunchao Zhang factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0 15007b8d4ba6SJunchao Zhang factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 15017b8d4ba6SJunchao Zhang factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 150286a27549SJunchao Zhang 15039371c9d4SSatish Balay transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 150486a27549SJunchao Zhang factors->iLt_d, factors->jLt_d, factors->aLt_d); 150586a27549SJunchao Zhang 150686a27549SJunchao Zhang /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 150786a27549SJunchao Zhang We have to sort the indices, until KK provides finer control options. 150886a27549SJunchao Zhang */ 15099371c9d4SSatish Balay sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 151086a27549SJunchao Zhang 151186a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 151286a27549SJunchao Zhang 151386a27549SJunchao Zhang /* Update U^T and do sptrsv symbolic */ 15147b8d4ba6SJunchao Zhang factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0 15157b8d4ba6SJunchao Zhang factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 15167b8d4ba6SJunchao Zhang factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 151786a27549SJunchao Zhang 15189371c9d4SSatish Balay transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 151986a27549SJunchao Zhang factors->iUt_d, factors->jUt_d, factors->aUt_d); 152086a27549SJunchao Zhang 152186a27549SJunchao Zhang /* Sort indices. See comments above */ 15229371c9d4SSatish Balay sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 152386a27549SJunchao Zhang 152486a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 152586a27549SJunchao Zhang factors->transpose_updated = PETSC_TRUE; 152686a27549SJunchao Zhang } 15273ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 152886a27549SJunchao Zhang } 152986a27549SJunchao Zhang 153086a27549SJunchao Zhang /* Solve Ax = b, with A = LU */ 1531d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1532d71ae5a4SJacob Faibussowitsch { 153386a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 153486a27549SJunchao Zhang PetscScalarKokkosView xv; 153586a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 153686a27549SJunchao Zhang 153786a27549SJunchao Zhang PetscFunctionBegin; 15389566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 15399566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 15409566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(b, &bv)); 15419566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(x, &xv)); 154286a27549SJunchao Zhang /* Solve L tmpv = b */ 15439566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 154486a27549SJunchao Zhang /* Solve Ux = tmpv */ 15459566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 15469566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(b, &bv)); 15479566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 15489566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 15493ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 155086a27549SJunchao Zhang } 155186a27549SJunchao Zhang 1552076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */ 1553d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1554d71ae5a4SJacob Faibussowitsch { 155586a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 155686a27549SJunchao Zhang PetscScalarKokkosView xv; 155786a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 155886a27549SJunchao Zhang 155986a27549SJunchao Zhang PetscFunctionBegin; 15609566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 15619566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 15629566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(b, &bv)); 15639566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(x, &xv)); 156486a27549SJunchao Zhang /* Solve U^T tmpv = b */ 156586a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 156686a27549SJunchao Zhang 156786a27549SJunchao Zhang /* Solve L^T x = tmpv */ 156886a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 15699566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(b, &bv)); 15709566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 15719566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 15723ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 157386a27549SJunchao Zhang } 157486a27549SJunchao Zhang 1575d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1576d71ae5a4SJacob Faibussowitsch { 157786a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 157886a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 157986a27549SJunchao Zhang PetscInt fill_lev = info->levels; 158086a27549SJunchao Zhang 158186a27549SJunchao Zhang PetscFunctionBegin; 15829566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 15839566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1584076ba34aSJunchao Zhang 1585076ba34aSJunchao Zhang auto a_d = aijkok->a_dual.view_device(); 1586076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1587076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1588076ba34aSJunchao Zhang 1589076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 159086a27549SJunchao Zhang 159186a27549SJunchao Zhang B->assembled = PETSC_TRUE; 159286a27549SJunchao Zhang B->preallocated = PETSC_TRUE; 159386a27549SJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos; 159486a27549SJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 159586a27549SJunchao Zhang B->ops->matsolve = NULL; 159686a27549SJunchao Zhang B->ops->matsolvetranspose = NULL; 159786a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 159886a27549SJunchao Zhang 159986a27549SJunchao Zhang /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 160086a27549SJunchao Zhang factors->transpose_updated = PETSC_FALSE; 160186a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 1602eeadb341SJunchao Zhang /* TODO: log flops, but how to know that? */ 16039566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 16043ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 160586a27549SJunchao Zhang } 160686a27549SJunchao Zhang 1607d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1608d71ae5a4SJacob Faibussowitsch { 160986a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 161086a27549SJunchao Zhang Mat_SeqAIJ *b; 161186a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 161286a27549SJunchao Zhang PetscInt fill_lev = info->levels; 161386a27549SJunchao Zhang PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 161486a27549SJunchao Zhang PetscInt n = A->rmap->n; 161586a27549SJunchao Zhang 161686a27549SJunchao Zhang PetscFunctionBegin; 16179566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 161886a27549SJunchao Zhang /* Rebuild factors */ 16199371c9d4SSatish Balay if (factors) { 16209371c9d4SSatish Balay factors->Destroy(); 16219371c9d4SSatish Balay } /* Destroy the old if it exists */ 16229371c9d4SSatish Balay else { 16239371c9d4SSatish Balay B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 16249371c9d4SSatish Balay } 162586a27549SJunchao Zhang 162686a27549SJunchao Zhang /* Create a spiluk handle and then do symbolic factorization */ 162786a27549SJunchao Zhang nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 162886a27549SJunchao Zhang factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 162986a27549SJunchao Zhang 163086a27549SJunchao Zhang auto spiluk_handle = factors->kh.get_spiluk_handle(); 163186a27549SJunchao Zhang 163286a27549SJunchao Zhang Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 163386a27549SJunchao Zhang Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 163486a27549SJunchao Zhang Kokkos::realloc(factors->iU_d, n + 1); 163586a27549SJunchao Zhang Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 163686a27549SJunchao Zhang 163786a27549SJunchao Zhang aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1638076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1639076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1640076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 164186a27549SJunchao Zhang /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 164286a27549SJunchao Zhang 164386a27549SJunchao Zhang Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 164486a27549SJunchao Zhang Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 164586a27549SJunchao Zhang Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 164686a27549SJunchao Zhang Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 164786a27549SJunchao Zhang 164886a27549SJunchao Zhang /* TODO: add options to select sptrsv algorithms */ 164986a27549SJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 165086a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 165186a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 165286a27549SJunchao Zhang #else 165386a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 165486a27549SJunchao Zhang #endif 165586a27549SJunchao Zhang 165686a27549SJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 165786a27549SJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 165886a27549SJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 165986a27549SJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 166086a27549SJunchao Zhang 166186a27549SJunchao Zhang /* Fill fields of the factor matrix B */ 16629566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 166386a27549SJunchao Zhang b = (Mat_SeqAIJ *)B->data; 166486a27549SJunchao Zhang b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 166586a27549SJunchao Zhang B->info.fill_ratio_given = info->fill; 1666a1e4e190SStefano Zampini B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 166786a27549SJunchao Zhang 166886a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 166986a27549SJunchao Zhang B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 16703ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1671930e68a5SMark Adams } 1672930e68a5SMark Adams 1673d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1674d71ae5a4SJacob Faibussowitsch { 1675930e68a5SMark Adams PetscFunctionBegin; 1676930e68a5SMark Adams *type = MATSOLVERKOKKOS; 16773ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1678930e68a5SMark Adams } 1679930e68a5SMark Adams 1680930e68a5SMark Adams /*MC 168186a27549SJunchao Zhang MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 168211a5261eSBarry Smith on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1683930e68a5SMark Adams 1684930e68a5SMark Adams Level: beginner 1685930e68a5SMark Adams 16862ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1687930e68a5SMark Adams M*/ 168886a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1689930e68a5SMark Adams { 1690930e68a5SMark Adams PetscInt n = A->rmap->n; 1691930e68a5SMark Adams 1692930e68a5SMark Adams PetscFunctionBegin; 16939566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 16949566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*B, n, n, n, n)); 1695930e68a5SMark Adams (*B)->factortype = ftype; 16969566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 16979566063dSJacob Faibussowitsch PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1698930e68a5SMark Adams 16998f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 17009566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 170186a27549SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 170286a27549SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 170386a27549SJunchao Zhang } else if (ftype == MAT_FACTOR_ILU) { 17049566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 170586a27549SJunchao Zhang (*B)->canuseordering = PETSC_FALSE; 170686a27549SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 170798921bdaSJacob Faibussowitsch } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1708930e68a5SMark Adams 17099566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 17109566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 17113ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1712930e68a5SMark Adams } 17138f7e8f9dSMark Adams 1714d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1715d71ae5a4SJacob Faibussowitsch { 171686a27549SJunchao Zhang PetscFunctionBegin; 17179566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 17189566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 17193ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 172086a27549SJunchao Zhang } 172186a27549SJunchao Zhang 1722076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */ 1723d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1724d71ae5a4SJacob Faibussowitsch { 1725076ba34aSJunchao Zhang const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map); 1726076ba34aSJunchao Zhang const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries); 1727076ba34aSJunchao Zhang const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values); 1728076ba34aSJunchao Zhang const PetscInt *i = iv.data(); 1729076ba34aSJunchao Zhang const PetscInt *j = jv.data(); 1730076ba34aSJunchao Zhang const PetscScalar *a = av.data(); 1731076ba34aSJunchao Zhang PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 1732076ba34aSJunchao Zhang 1733076ba34aSJunchao Zhang PetscFunctionBegin; 17349566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 1735076ba34aSJunchao Zhang for (PetscInt k = 0; k < m; k++) { 17369566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 173748a46eb9SPierre Jolivet for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 17389566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 1739076ba34aSJunchao Zhang } 17403ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1741076ba34aSJunchao Zhang } 1742