1e36ced11SJunchao Zhang #include <petsc_kokkos.hpp> 211d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 3076ba34aSJunchao Zhang #include <petscpkg_version.h> 4152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 542550becSJunchao Zhang #include <petsc/private/sfimpl.h> 68c3ff71bSJunchao Zhang #include <petscsystypes.h> 78c3ff71bSJunchao Zhang #include <petscerror.h> 88c3ff71bSJunchao Zhang 98c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 10f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 118c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 128c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 1386a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp> 1486a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp> 15076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp> 16076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp> 179d13fa56SJunchao Zhang #include <KokkosBatched_LU_Decl.hpp> 189d13fa56SJunchao Zhang #include <KokkosBatched_InverseLU_Decl.hpp> 1986a27549SJunchao Zhang 2042550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 218c3ff71bSJunchao Zhang 220e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 23f98996d3SJunchao Zhang #include <KokkosSparse_Utils.hpp> 24f98996d3SJunchao Zhang using KokkosSparse::sort_crs_matrix; 259371c9d4SSatish Balay using KokkosSparse::Impl::transpose_matrix; 26f98996d3SJunchao Zhang #else 27f98996d3SJunchao Zhang #include <KokkosKernels_Sorting.hpp> 28f98996d3SJunchao Zhang using KokkosKernels::sort_crs_matrix; 299371c9d4SSatish Balay using KokkosKernels::Impl::transpose_matrix; 30f98996d3SJunchao Zhang #endif 31f98996d3SJunchao Zhang 328c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 338c3ff71bSJunchao Zhang 34076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 35076ba34aSJunchao Zhang we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 36076ba34aSJunchao Zhang In the latter case, it is important to set a_dual's sync state correctly. 37076ba34aSJunchao Zhang */ 38d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 39d71ae5a4SJacob Faibussowitsch { 40076ba34aSJunchao Zhang Mat_SeqAIJ *aijseq; 41076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 428c3ff71bSJunchao Zhang 438c3ff71bSJunchao Zhang PetscFunctionBegin; 443ba16761SJacob Faibussowitsch if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 459566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 46076ba34aSJunchao Zhang 47076ba34aSJunchao Zhang aijseq = static_cast<Mat_SeqAIJ *>(A->data); 48076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 49076ba34aSJunchao Zhang 50076ba34aSJunchao Zhang /* If aijkok does not exist, we just copy i, j to device. 51076ba34aSJunchao Zhang If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 52076ba34aSJunchao Zhang In both cases, we build a new aijkok structure. 53076ba34aSJunchao Zhang */ 54076ba34aSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 55076ba34aSJunchao Zhang delete aijkok; 56f4747e26SJunchao Zhang aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 57076ba34aSJunchao Zhang A->spptr = aijkok; 58f4747e26SJunchao Zhang } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 59f4747e26SJunchao Zhang MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 60f4747e26SJunchao Zhang auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 61f4747e26SJunchao Zhang aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 62076ba34aSJunchao Zhang } 633ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 648c3ff71bSJunchao Zhang } 658c3ff71bSJunchao Zhang 6686a27549SJunchao Zhang /* Sync CSR data to device if not yet */ 67d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 68d71ae5a4SJacob Faibussowitsch { 698c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 708c3ff71bSJunchao Zhang 718c3ff71bSJunchao Zhang PetscFunctionBegin; 72aaa8cc7dSPierre Jolivet PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 735f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 74076ba34aSJunchao Zhang if (aijkok->a_dual.need_sync_device()) { 75076ba34aSJunchao Zhang aijkok->a_dual.sync_device(); 76580c7c76SPierre Jolivet aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 7786a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 788c3ff71bSJunchao Zhang } 793ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 808c3ff71bSJunchao Zhang } 818c3ff71bSJunchao Zhang 82076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */ 83d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 84d71ae5a4SJacob Faibussowitsch { 8586a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 8686a27549SJunchao Zhang 8786a27549SJunchao Zhang PetscFunctionBegin; 885f80ce2aSJacob Faibussowitsch PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 8986a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 9086a27549SJunchao Zhang aijkok->a_dual.modify_device(); 9186a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 9286a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 939566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 949566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 953ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 9686a27549SJunchao Zhang } 9786a27549SJunchao Zhang 98d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 99d71ae5a4SJacob Faibussowitsch { 100f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 101e36ced11SJunchao Zhang auto &exec = PetscGetKokkosExecutionSpace(); 102f0cf5187SStefano Zampini 103f0cf5187SStefano Zampini PetscFunctionBegin; 104f0cf5187SStefano Zampini PetscCheckTypeName(A, MATSEQAIJKOKKOS); 10586a27549SJunchao Zhang /* We do not expect one needs factors on host */ 106aaa8cc7dSPierre Jolivet PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 1075f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 108e36ced11SJunchao Zhang PetscCallCXX(aijkok->a_dual.sync_host(exec)); 109e36ced11SJunchao Zhang PetscCallCXX(exec.fence()); 1103ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 111f0cf5187SStefano Zampini } 112f0cf5187SStefano Zampini 113d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 114d71ae5a4SJacob Faibussowitsch { 115076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 116f0cf5187SStefano Zampini 117f0cf5187SStefano Zampini PetscFunctionBegin; 1185519a089SJose E. Roman /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 1195519a089SJose E. Roman Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 1205519a089SJose E. Roman reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 1215519a089SJose E. Roman must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 1225519a089SJose E. Roman */ 1235519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 124e36ced11SJunchao Zhang auto &exec = PetscGetKokkosExecutionSpace(); 125e36ced11SJunchao Zhang PetscCallCXX(aijkok->a_dual.sync_host(exec)); 126e36ced11SJunchao Zhang PetscCallCXX(exec.fence()); 127076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 128076ba34aSJunchao Zhang } else { /* Happens when calling MatSetValues on a newly created matrix */ 129076ba34aSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 130076ba34aSJunchao Zhang } 1313ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 132076ba34aSJunchao Zhang } 133076ba34aSJunchao Zhang 134d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 135d71ae5a4SJacob Faibussowitsch { 136076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 137076ba34aSJunchao Zhang 138076ba34aSJunchao Zhang PetscFunctionBegin; 1395519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 1403ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 141076ba34aSJunchao Zhang } 142076ba34aSJunchao Zhang 143d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 144d71ae5a4SJacob Faibussowitsch { 145076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 146076ba34aSJunchao Zhang 147076ba34aSJunchao Zhang PetscFunctionBegin; 1485519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 149e36ced11SJunchao Zhang auto &exec = PetscGetKokkosExecutionSpace(); 150e36ced11SJunchao Zhang PetscCallCXX(aijkok->a_dual.sync_host(exec)); 151e36ced11SJunchao Zhang PetscCallCXX(exec.fence()); 152076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1532328674fSJunchao Zhang } else { 1542328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 1552328674fSJunchao Zhang } 1563ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 157076ba34aSJunchao Zhang } 158076ba34aSJunchao Zhang 159d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 160d71ae5a4SJacob Faibussowitsch { 161076ba34aSJunchao Zhang PetscFunctionBegin; 162076ba34aSJunchao Zhang *array = NULL; 1633ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 164076ba34aSJunchao Zhang } 165076ba34aSJunchao Zhang 166d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 167d71ae5a4SJacob Faibussowitsch { 168076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 169076ba34aSJunchao Zhang 170076ba34aSJunchao Zhang PetscFunctionBegin; 1715519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 172076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1732328674fSJunchao Zhang } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 1742328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 1752328674fSJunchao Zhang } 1763ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 177076ba34aSJunchao Zhang } 178076ba34aSJunchao Zhang 179d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 180d71ae5a4SJacob Faibussowitsch { 181076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 182076ba34aSJunchao Zhang 183076ba34aSJunchao Zhang PetscFunctionBegin; 1845519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 185076ba34aSJunchao Zhang aijkok->a_dual.clear_sync_state(); 186076ba34aSJunchao Zhang aijkok->a_dual.modify_host(); 1872328674fSJunchao Zhang } 1883ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 189f0cf5187SStefano Zampini } 190f0cf5187SStefano Zampini 191d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 192d71ae5a4SJacob Faibussowitsch { 1937ee59b9bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1947ee59b9bSJunchao Zhang 1957ee59b9bSJunchao Zhang PetscFunctionBegin; 1967ee59b9bSJunchao Zhang PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 1977ee59b9bSJunchao Zhang 1987ee59b9bSJunchao Zhang if (i) *i = aijkok->i_device_data(); 1997ee59b9bSJunchao Zhang if (j) *j = aijkok->j_device_data(); 2007ee59b9bSJunchao Zhang if (a) { 2017ee59b9bSJunchao Zhang aijkok->a_dual.sync_device(); 2027ee59b9bSJunchao Zhang *a = aijkok->a_device_data(); 2037ee59b9bSJunchao Zhang } 2047ee59b9bSJunchao Zhang if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 2053ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2067ee59b9bSJunchao Zhang } 2077ee59b9bSJunchao Zhang 2080e3ece09SJunchao Zhang /* 2090e3ece09SJunchao Zhang Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 2100e3ece09SJunchao Zhang 2110e3ece09SJunchao Zhang Input Parameter: 2120e3ece09SJunchao Zhang . A - the MATSEQAIJKOKKOS matrix 2130e3ece09SJunchao Zhang 2140e3ece09SJunchao Zhang Output Parameters: 2150e3ece09SJunchao Zhang + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 216aaa8cc7dSPierre Jolivet - T_d - the transpose on device, whose value array is allocated but not initialized 2170e3ece09SJunchao Zhang */ 2180e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 219d71ae5a4SJacob Faibussowitsch { 2200e3ece09SJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 2210e3ece09SJunchao Zhang PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 2220e3ece09SJunchao Zhang const PetscInt *Ai = aseq->i, *Aj = aseq->j; 2237b8d4ba6SJunchao Zhang MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 2240e3ece09SJunchao Zhang MatRowMapType *Ti = Ti_h.data(); 2257b8d4ba6SJunchao Zhang MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 2267b8d4ba6SJunchao Zhang MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 2270e3ece09SJunchao Zhang PetscInt *Tj = Tj_h.data(); 2280e3ece09SJunchao Zhang PetscInt *perm = perm_h.data(); 2290e3ece09SJunchao Zhang PetscInt *offset; 230152b3e56SJunchao Zhang 231152b3e56SJunchao Zhang PetscFunctionBegin; 2320e3ece09SJunchao Zhang // Populate Ti 2330e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 2340e3ece09SJunchao Zhang Ti++; 2350e3ece09SJunchao Zhang for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 2360e3ece09SJunchao Zhang Ti--; 2370e3ece09SJunchao Zhang for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 2380e3ece09SJunchao Zhang 2390e3ece09SJunchao Zhang // Populate Tj and the permutation array 2400e3ece09SJunchao Zhang PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 2410e3ece09SJunchao Zhang for (PetscInt i = 0; i < m; i++) { 2420e3ece09SJunchao Zhang for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 2430e3ece09SJunchao Zhang PetscInt r = Aj[j]; // row r of T 2440e3ece09SJunchao Zhang PetscInt disp = Ti[r] + offset[r]; 2450e3ece09SJunchao Zhang 2460e3ece09SJunchao Zhang Tj[disp] = i; // col i of T 2470e3ece09SJunchao Zhang perm[disp] = j; 2480e3ece09SJunchao Zhang offset[r]++; 249076ba34aSJunchao Zhang } 2500e3ece09SJunchao Zhang } 2510e3ece09SJunchao Zhang PetscCall(PetscFree(offset)); 2520e3ece09SJunchao Zhang 2530e3ece09SJunchao Zhang // Sort each row of T, along with the permutation array 2540e3ece09SJunchao Zhang for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 2550e3ece09SJunchao Zhang 2560e3ece09SJunchao Zhang // Output perm and T on device 2570e3ece09SJunchao Zhang auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 2580e3ece09SJunchao Zhang auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 2590e3ece09SJunchao Zhang PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 2600e3ece09SJunchao Zhang PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 2613ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 262152b3e56SJunchao Zhang } 263152b3e56SJunchao Zhang 2640e3ece09SJunchao Zhang // Generate the transpose on device and cache it internally 2650e3ece09SJunchao Zhang // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 2660e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 267d71ae5a4SJacob Faibussowitsch { 2680e3ece09SJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 2690e3ece09SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 2700e3ece09SJunchao Zhang PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 2710e3ece09SJunchao Zhang KokkosCsrMatrix &T = akok->csrmatT; 272152b3e56SJunchao Zhang 273152b3e56SJunchao Zhang PetscFunctionBegin; 2740e3ece09SJunchao Zhang PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 275145b44c9SPierre Jolivet PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 2760e3ece09SJunchao Zhang 2770e3ece09SJunchao Zhang const auto &Aa = akok->a_dual.view_device(); 2780e3ece09SJunchao Zhang 2790e3ece09SJunchao Zhang if (A->symmetric == PETSC_BOOL3_TRUE) { 2800e3ece09SJunchao Zhang *csrmatT = akok->csrmat; 2810e3ece09SJunchao Zhang } else { 2820e3ece09SJunchao Zhang // See if we already have a cached transpose and its value is up to date 2830e3ece09SJunchao Zhang if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 2840e3ece09SJunchao Zhang if (!akok->transpose_updated) { // if the value is out of date, update the cached version 2850e3ece09SJunchao Zhang const auto &perm = akok->transpose_perm; // get the permutation array 2860e3ece09SJunchao Zhang auto &Ta = T.values; 2870e3ece09SJunchao Zhang 288d326c3f1SJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 289076ba34aSJunchao Zhang } 2900e3ece09SJunchao Zhang } else { // Generate T of size n x m for the first time 2910e3ece09SJunchao Zhang MatRowMapKokkosView perm; 2920e3ece09SJunchao Zhang 2930e3ece09SJunchao Zhang PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 2940e3ece09SJunchao Zhang akok->transpose_perm = perm; // cache the perm in this matrix for reuse 295d326c3f1SJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 2960e3ece09SJunchao Zhang } 2970e3ece09SJunchao Zhang akok->transpose_updated = PETSC_TRUE; 2980e3ece09SJunchao Zhang *csrmatT = akok->csrmatT; 2990e3ece09SJunchao Zhang } 3000e3ece09SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 3010e3ece09SJunchao Zhang } 3020e3ece09SJunchao Zhang 3030e3ece09SJunchao Zhang // Generate the Hermitian on device and cache it internally 3040e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 3050e3ece09SJunchao Zhang { 3060e3ece09SJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 3070e3ece09SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 3080e3ece09SJunchao Zhang PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 3090e3ece09SJunchao Zhang KokkosCsrMatrix &T = akok->csrmatH; 3100e3ece09SJunchao Zhang 3110e3ece09SJunchao Zhang PetscFunctionBegin; 3120e3ece09SJunchao Zhang PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 3130e3ece09SJunchao Zhang PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 3140e3ece09SJunchao Zhang 3150e3ece09SJunchao Zhang const auto &Aa = akok->a_dual.view_device(); 3160e3ece09SJunchao Zhang 3170e3ece09SJunchao Zhang if (A->hermitian == PETSC_BOOL3_TRUE) { 3180e3ece09SJunchao Zhang *csrmatH = akok->csrmat; 3190e3ece09SJunchao Zhang } else { 3200e3ece09SJunchao Zhang // See if we already have a cached hermitian and its value is up to date 3210e3ece09SJunchao Zhang if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 3220e3ece09SJunchao Zhang if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 3230e3ece09SJunchao Zhang const auto &perm = akok->transpose_perm; // get the permutation array 3240e3ece09SJunchao Zhang auto &Ta = T.values; 3250e3ece09SJunchao Zhang 326d326c3f1SJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 3270e3ece09SJunchao Zhang } 3280e3ece09SJunchao Zhang } else { // Generate T of size n x m for the first time 3290e3ece09SJunchao Zhang MatRowMapKokkosView perm; 3300e3ece09SJunchao Zhang 3310e3ece09SJunchao Zhang PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 3320e3ece09SJunchao Zhang akok->transpose_perm = perm; // cache the perm in this matrix for reuse 333d326c3f1SJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 3340e3ece09SJunchao Zhang } 3350e3ece09SJunchao Zhang akok->hermitian_updated = PETSC_TRUE; 3360e3ece09SJunchao Zhang *csrmatH = akok->csrmatH; 3370e3ece09SJunchao Zhang } 3383ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 339152b3e56SJunchao Zhang } 340a587d139SMark 3418c3ff71bSJunchao Zhang /* y = A x */ 342d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 343d71ae5a4SJacob Faibussowitsch { 3448c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 345152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 346152b3e56SJunchao Zhang PetscScalarKokkosView yv; 3478c3ff71bSJunchao Zhang 3488c3ff71bSJunchao Zhang PetscFunctionBegin; 3499566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3509566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3519566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 3529566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 3538c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 354d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 3559566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 3569566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 357076ba34aSJunchao Zhang /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 3589566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 3599566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3603ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3618c3ff71bSJunchao Zhang } 3628c3ff71bSJunchao Zhang 3638c3ff71bSJunchao Zhang /* y = A^T x */ 364d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 365d71ae5a4SJacob Faibussowitsch { 3668c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 367152b3e56SJunchao Zhang const char *mode; 368152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 369152b3e56SJunchao Zhang PetscScalarKokkosView yv; 3700e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 3718c3ff71bSJunchao Zhang 3728c3ff71bSJunchao Zhang PetscFunctionBegin; 3739566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3749566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3759566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 3769566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 377152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 3789566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 379152b3e56SJunchao Zhang mode = "N"; 380152b3e56SJunchao Zhang } else { 381076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 3820e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 383152b3e56SJunchao Zhang mode = "T"; 384152b3e56SJunchao Zhang } 385d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 3869566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 3879566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 3880e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 3899566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3903ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3918c3ff71bSJunchao Zhang } 3928c3ff71bSJunchao Zhang 3938c3ff71bSJunchao Zhang /* y = A^H x */ 394d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 395d71ae5a4SJacob Faibussowitsch { 3968c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 397152b3e56SJunchao Zhang const char *mode; 398152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 399152b3e56SJunchao Zhang PetscScalarKokkosView yv; 4000e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 4018c3ff71bSJunchao Zhang 4028c3ff71bSJunchao Zhang PetscFunctionBegin; 4039566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4049566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 4059566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 4069566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 407152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 4089566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 409152b3e56SJunchao Zhang mode = "N"; 410152b3e56SJunchao Zhang } else { 411076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 4120e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 413152b3e56SJunchao Zhang mode = "C"; 414152b3e56SJunchao Zhang } 415d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 4169566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 4179566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 4180e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 4199566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4203ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4218c3ff71bSJunchao Zhang } 4228c3ff71bSJunchao Zhang 4238c3ff71bSJunchao Zhang /* z = A x + y */ 424d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 425d71ae5a4SJacob Faibussowitsch { 4268c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 42792896123SJunchao Zhang ConstPetscScalarKokkosView xv; 428152b3e56SJunchao Zhang PetscScalarKokkosView zv; 4298c3ff71bSJunchao Zhang 4308c3ff71bSJunchao Zhang PetscFunctionBegin; 4319566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4329566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 43392896123SJunchao Zhang if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host 4349566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 43592896123SJunchao Zhang PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device 4368c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 437d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 4389566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 43992896123SJunchao Zhang PetscCall(VecRestoreKokkosView(zz, &zv)); 4409566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 4419566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4423ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4438c3ff71bSJunchao Zhang } 4448c3ff71bSJunchao Zhang 4458c3ff71bSJunchao Zhang /* z = A^T x + y */ 446d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 447d71ae5a4SJacob Faibussowitsch { 4488c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 449152b3e56SJunchao Zhang const char *mode; 45092896123SJunchao Zhang ConstPetscScalarKokkosView xv; 451152b3e56SJunchao Zhang PetscScalarKokkosView zv; 4520e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 4538c3ff71bSJunchao Zhang 4548c3ff71bSJunchao Zhang PetscFunctionBegin; 4559566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4569566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 45792896123SJunchao Zhang if (zz != yy) PetscCall(VecCopy(yy, zz)); 4589566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 45992896123SJunchao Zhang PetscCall(VecGetKokkosView(zz, &zv)); 460152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 4619566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 462152b3e56SJunchao Zhang mode = "N"; 463152b3e56SJunchao Zhang } else { 464076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 4650e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 466152b3e56SJunchao Zhang mode = "T"; 467152b3e56SJunchao Zhang } 468d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 4699566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 47092896123SJunchao Zhang PetscCall(VecRestoreKokkosView(zz, &zv)); 4710e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 4729566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4733ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4748c3ff71bSJunchao Zhang } 4758c3ff71bSJunchao Zhang 4768c3ff71bSJunchao Zhang /* z = A^H x + y */ 477d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 478d71ae5a4SJacob Faibussowitsch { 4798c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 480152b3e56SJunchao Zhang const char *mode; 48192896123SJunchao Zhang ConstPetscScalarKokkosView xv; 482152b3e56SJunchao Zhang PetscScalarKokkosView zv; 4830e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 4848c3ff71bSJunchao Zhang 4858c3ff71bSJunchao Zhang PetscFunctionBegin; 4869566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4879566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 48892896123SJunchao Zhang if (zz != yy) PetscCall(VecCopy(yy, zz)); 4899566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 49092896123SJunchao Zhang PetscCall(VecGetKokkosView(zz, &zv)); 491152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 4929566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 493152b3e56SJunchao Zhang mode = "N"; 494152b3e56SJunchao Zhang } else { 495076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 4960e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 497152b3e56SJunchao Zhang mode = "C"; 498152b3e56SJunchao Zhang } 499d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 5009566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 50192896123SJunchao Zhang PetscCall(VecRestoreKokkosView(zz, &zv)); 5020e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 5039566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 5043ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 505152b3e56SJunchao Zhang } 506152b3e56SJunchao Zhang 50766976f2fSJacob Faibussowitsch static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 508d71ae5a4SJacob Faibussowitsch { 509152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 510152b3e56SJunchao Zhang 511152b3e56SJunchao Zhang PetscFunctionBegin; 512152b3e56SJunchao Zhang switch (op) { 513152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 514152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 5159566063dSJacob Faibussowitsch if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 516152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 517152b3e56SJunchao Zhang break; 518d71ae5a4SJacob Faibussowitsch default: 519d71ae5a4SJacob Faibussowitsch PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 520d71ae5a4SJacob Faibussowitsch break; 521152b3e56SJunchao Zhang } 5223ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5238c3ff71bSJunchao Zhang } 5248c3ff71bSJunchao Zhang 525076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */ 526d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 527d71ae5a4SJacob Faibussowitsch { 528076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 5298c3ff71bSJunchao Zhang 5308c3ff71bSJunchao Zhang PetscFunctionBegin; 5319566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 532076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 5339566063dSJacob Faibussowitsch PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 5348c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 5359566063dSJacob Faibussowitsch PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 536076ba34aSJunchao Zhang } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 5375f80ce2aSJacob Faibussowitsch PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 5389566063dSJacob Faibussowitsch PetscCall(PetscFree(A->defaultvectype)); 5399566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 5409566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 5419566063dSJacob Faibussowitsch PetscCall(MatSetOps_SeqAIJKokkos(A)); 542076ba34aSJunchao Zhang aseq = static_cast<Mat_SeqAIJ *>(A->data); 543394ed5ebSJunchao Zhang if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 5445f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 545f4747e26SJunchao Zhang A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 5468c3ff71bSJunchao Zhang } 547076ba34aSJunchao Zhang } 5483ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5498c3ff71bSJunchao Zhang } 5508c3ff71bSJunchao Zhang 551076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 552076ba34aSJunchao Zhang an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 553076ba34aSJunchao Zhang */ 554d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 555d71ae5a4SJacob Faibussowitsch { 556076ba34aSJunchao Zhang Mat_SeqAIJ *bseq; 557076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 558076ba34aSJunchao Zhang Mat mat; 5598c3ff71bSJunchao Zhang 5608c3ff71bSJunchao Zhang PetscFunctionBegin; 561076ba34aSJunchao Zhang /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 5629566063dSJacob Faibussowitsch PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 563076ba34aSJunchao Zhang mat = *B; 564f4747e26SJunchao Zhang if (A->assembled) { 565076ba34aSJunchao Zhang bseq = static_cast<Mat_SeqAIJ *>(mat->data); 566f4747e26SJunchao Zhang bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 567076ba34aSJunchao Zhang bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 568076ba34aSJunchao Zhang /* Now copy values to B if needed */ 569076ba34aSJunchao Zhang if (dupOption == MAT_COPY_VALUES) { 570076ba34aSJunchao Zhang if (akok->a_dual.need_sync_device()) { 571076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 572076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 573076ba34aSJunchao Zhang } else { /* If device has the latest data, we only copy data on device */ 574076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 575076ba34aSJunchao Zhang bkok->a_dual.modify_device(); 576076ba34aSJunchao Zhang } 577076ba34aSJunchao Zhang } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 578076ba34aSJunchao Zhang /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 579076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 580076ba34aSJunchao Zhang } 581076ba34aSJunchao Zhang mat->spptr = bkok; 582076ba34aSJunchao Zhang } 583076ba34aSJunchao Zhang 5849566063dSJacob Faibussowitsch PetscCall(PetscFree(mat->defaultvectype)); 5859566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 5869566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 5879566063dSJacob Faibussowitsch PetscCall(MatSetOps_SeqAIJKokkos(mat)); 5883ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5898c3ff71bSJunchao Zhang } 5908c3ff71bSJunchao Zhang 591d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 592d71ae5a4SJacob Faibussowitsch { 5930ecb592aSJunchao Zhang Mat At; 5940e3ece09SJunchao Zhang KokkosCsrMatrix internT; 5950ecb592aSJunchao Zhang Mat_SeqAIJKokkos *atkok, *bkok; 5960ecb592aSJunchao Zhang 5970ecb592aSJunchao Zhang PetscFunctionBegin; 5987fb60732SBarry Smith if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 5999566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 6000ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 601ff751488SJunchao Zhang /* Deep copy internT, as we want to isolate the internal transpose */ 6020e3ece09SJunchao Zhang PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 6039566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 6040ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) *B = At; 6059566063dSJacob Faibussowitsch else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 6060ecb592aSJunchao Zhang } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 6070ecb592aSJunchao Zhang if ((*B)->assembled) { 6080ecb592aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 6090e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 6109566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 6110ecb592aSJunchao Zhang } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 6120ecb592aSJunchao Zhang Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 6130e3ece09SJunchao Zhang MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 6140e3ece09SJunchao Zhang MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 6150e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 6160e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 6170ecb592aSJunchao Zhang } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 6180ecb592aSJunchao Zhang } 6193ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6200ecb592aSJunchao Zhang } 6210ecb592aSJunchao Zhang 622d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 623d71ae5a4SJacob Faibussowitsch { 62486a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 6258c3ff71bSJunchao Zhang 6268c3ff71bSJunchao Zhang PetscFunctionBegin; 62786a27549SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 62886a27549SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 6298c3ff71bSJunchao Zhang delete aijkok; 63086a27549SJunchao Zhang } else { 63186a27549SJunchao Zhang delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 63286a27549SJunchao Zhang } 633cbc6b225SStefano Zampini A->spptr = NULL; 6349566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 6359566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 6369566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 63757761e9aSJunchao Zhang #if defined(PETSC_HAVE_HYPRE) 63857761e9aSJunchao Zhang PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL)); 63957761e9aSJunchao Zhang #endif 6409566063dSJacob Faibussowitsch PetscCall(MatDestroy_SeqAIJ(A)); 6413ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6428c3ff71bSJunchao Zhang } 6438c3ff71bSJunchao Zhang 6443f3ba80aSJunchao Zhang /*MC 6453f3ba80aSJunchao Zhang MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 6463f3ba80aSJunchao Zhang 64715229ffcSPierre Jolivet A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 6483f3ba80aSJunchao Zhang 6492ef1f0ffSBarry Smith Options Database Key: 65011a5261eSBarry Smith . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 6513f3ba80aSJunchao Zhang 6523f3ba80aSJunchao Zhang Level: beginner 6533f3ba80aSJunchao Zhang 6541cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 6553f3ba80aSJunchao Zhang M*/ 656d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 657d71ae5a4SJacob Faibussowitsch { 65886a27549SJunchao Zhang PetscFunctionBegin; 6599566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 6609566063dSJacob Faibussowitsch PetscCall(MatCreate_SeqAIJ(A)); 6619566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 6623ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 66386a27549SJunchao Zhang } 66486a27549SJunchao Zhang 665076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 666d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 667d71ae5a4SJacob Faibussowitsch { 668076ba34aSJunchao Zhang Mat_SeqAIJ *a, *b; 669076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok, *bkok, *ckok; 670076ba34aSJunchao Zhang MatScalarKokkosView aa, ba, ca; 671076ba34aSJunchao Zhang MatRowMapKokkosView ai, bi, ci; 672076ba34aSJunchao Zhang MatColIdxKokkosView aj, bj, cj; 673076ba34aSJunchao Zhang PetscInt m, n, nnz, aN; 674a3f881fbSStefano Zampini 675a3f881fbSStefano Zampini PetscFunctionBegin; 676076ba34aSJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 677076ba34aSJunchao Zhang PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 6784f572ea9SToby Isaac PetscAssertPointer(C, 4); 679076ba34aSJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 680076ba34aSJunchao Zhang PetscCheckTypeName(B, MATSEQAIJKOKKOS); 6815f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 6825f80ce2aSJacob Faibussowitsch PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 683076ba34aSJunchao Zhang 6849566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 6859566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 686076ba34aSJunchao Zhang a = static_cast<Mat_SeqAIJ *>(A->data); 687076ba34aSJunchao Zhang b = static_cast<Mat_SeqAIJ *>(B->data); 688076ba34aSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 689076ba34aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 690076ba34aSJunchao Zhang aa = akok->a_dual.view_device(); 691076ba34aSJunchao Zhang ai = akok->i_dual.view_device(); 692076ba34aSJunchao Zhang ba = bkok->a_dual.view_device(); 693076ba34aSJunchao Zhang bi = bkok->i_dual.view_device(); 694076ba34aSJunchao Zhang m = A->rmap->n; /* M, N and nnz of C */ 695076ba34aSJunchao Zhang n = A->cmap->n + B->cmap->n; 696076ba34aSJunchao Zhang nnz = a->nz + b->nz; 697076ba34aSJunchao Zhang aN = A->cmap->n; /* N of A */ 698076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { 699076ba34aSJunchao Zhang aj = akok->j_dual.view_device(); 700076ba34aSJunchao Zhang bj = bkok->j_dual.view_device(); 701076ba34aSJunchao Zhang auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 702076ba34aSJunchao Zhang auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 703076ba34aSJunchao Zhang auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 704076ba34aSJunchao Zhang ca = ca_dual.view_device(); 705076ba34aSJunchao Zhang ci = ci_dual.view_device(); 706076ba34aSJunchao Zhang cj = cj_dual.view_device(); 707076ba34aSJunchao Zhang 708076ba34aSJunchao Zhang /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 7099371c9d4SSatish Balay Kokkos::parallel_for( 710d326c3f1SJunchao Zhang Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 711076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 712076ba34aSJunchao Zhang PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 713076ba34aSJunchao Zhang 714076ba34aSJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 715076ba34aSJunchao Zhang ci(i) = coffset; 716076ba34aSJunchao Zhang if (i == m - 1) ci(m) = ai(m) + bi(m); 717076ba34aSJunchao Zhang }); 718076ba34aSJunchao Zhang 719076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 720076ba34aSJunchao Zhang if (k < alen) { 721076ba34aSJunchao Zhang ca(coffset + k) = aa(ai(i) + k); 722076ba34aSJunchao Zhang cj(coffset + k) = aj(ai(i) + k); 723076ba34aSJunchao Zhang } else { 724076ba34aSJunchao Zhang ca(coffset + k) = ba(bi(i) + k - alen); 725076ba34aSJunchao Zhang cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 726076ba34aSJunchao Zhang } 727076ba34aSJunchao Zhang }); 728076ba34aSJunchao Zhang }); 729076ba34aSJunchao Zhang ca_dual.modify_device(); 730076ba34aSJunchao Zhang ci_dual.modify_device(); 731076ba34aSJunchao Zhang cj_dual.modify_device(); 7329566063dSJacob Faibussowitsch PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 7339566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 734076ba34aSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { 735076ba34aSJunchao Zhang PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 736076ba34aSJunchao Zhang PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 737076ba34aSJunchao Zhang ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 738076ba34aSJunchao Zhang ca = ckok->a_dual.view_device(); 739076ba34aSJunchao Zhang ci = ckok->i_dual.view_device(); 740076ba34aSJunchao Zhang 7419371c9d4SSatish Balay Kokkos::parallel_for( 742d326c3f1SJunchao Zhang Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 743076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 744076ba34aSJunchao Zhang PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 745076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 746076ba34aSJunchao Zhang if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 747076ba34aSJunchao Zhang else ca(ci(i) + k) = ba(bi(i) + k - alen); 748076ba34aSJunchao Zhang }); 749076ba34aSJunchao Zhang }); 7509566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 751076ba34aSJunchao Zhang } 7523ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 753076ba34aSJunchao Zhang } 754076ba34aSJunchao Zhang 755d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 756d71ae5a4SJacob Faibussowitsch { 757076ba34aSJunchao Zhang PetscFunctionBegin; 758076ba34aSJunchao Zhang delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 7593ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 760a3f881fbSStefano Zampini } 761a3f881fbSStefano Zampini 762d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 763d71ae5a4SJacob Faibussowitsch { 764a3f881fbSStefano Zampini Mat_Product *product = C->product; 765a3f881fbSStefano Zampini Mat A, B; 766076ba34aSJunchao Zhang bool transA, transB; /* use bool, since KK needs this type */ 767a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok, *bkok, *ckok; 768a3f881fbSStefano Zampini Mat_SeqAIJ *c; 769076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 7700e3ece09SJunchao Zhang KokkosCsrMatrix csrmatA, csrmatB; 771a3f881fbSStefano Zampini 772a3f881fbSStefano Zampini PetscFunctionBegin; 773a3f881fbSStefano Zampini MatCheckProduct(C, 1); 7745f80ce2aSJacob Faibussowitsch PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 775076ba34aSJunchao Zhang pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 776076ba34aSJunchao Zhang 7770e3ece09SJunchao Zhang // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 7780e3ece09SJunchao Zhang // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 7790e3ece09SJunchao Zhang // we still do numeric. 7800e3ece09SJunchao Zhang if (pdata->reusesym) { // numeric reuses results from symbolic 7810e3ece09SJunchao Zhang pdata->reusesym = PETSC_FALSE; 7823ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 783076ba34aSJunchao Zhang } 784076ba34aSJunchao Zhang 785076ba34aSJunchao Zhang switch (product->type) { 7869371c9d4SSatish Balay case MATPRODUCT_AB: 7879371c9d4SSatish Balay transA = false; 7889371c9d4SSatish Balay transB = false; 7899371c9d4SSatish Balay break; 7909371c9d4SSatish Balay case MATPRODUCT_AtB: 7919371c9d4SSatish Balay transA = true; 7929371c9d4SSatish Balay transB = false; 7939371c9d4SSatish Balay break; 7949371c9d4SSatish Balay case MATPRODUCT_ABt: 7959371c9d4SSatish Balay transA = false; 7969371c9d4SSatish Balay transB = true; 7979371c9d4SSatish Balay break; 798d71ae5a4SJacob Faibussowitsch default: 799d71ae5a4SJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 800076ba34aSJunchao Zhang } 801076ba34aSJunchao Zhang 802a3f881fbSStefano Zampini A = product->A; 803a3f881fbSStefano Zampini B = product->B; 8049566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 8059566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 806a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 807a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 808a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 809076ba34aSJunchao Zhang 8105f80ce2aSJacob Faibussowitsch PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 811076ba34aSJunchao Zhang 8120e3ece09SJunchao Zhang csrmatA = akok->csrmat; 8130e3ece09SJunchao Zhang csrmatB = bkok->csrmat; 814076ba34aSJunchao Zhang 815076ba34aSJunchao Zhang /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 816076ba34aSJunchao Zhang if (transA) { 8179566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 818076ba34aSJunchao Zhang transA = false; 819a3f881fbSStefano Zampini } 820a3f881fbSStefano Zampini 821076ba34aSJunchao Zhang if (transB) { 8229566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 823076ba34aSJunchao Zhang transB = false; 824076ba34aSJunchao Zhang } 8259566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 8260e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 8270e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 828866eb059SJunchao Zhang auto spgemmHandle = pdata->kh.get_spgemm_handle(); 829866eb059SJunchao Zhang if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 830e944a159SJunchao Zhang #endif 831866eb059SJunchao Zhang 8329566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 8339566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(C)); 834a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 835a3f881fbSStefano Zampini c = (Mat_SeqAIJ *)C->data; 8369566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 8379566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 8389566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 839a3f881fbSStefano Zampini c->reallocs = 0; 840076ba34aSJunchao Zhang C->info.mallocs = 0; 841a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 842a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 843a3f881fbSStefano Zampini C->num_ass++; 8443ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 845a3f881fbSStefano Zampini } 846a3f881fbSStefano Zampini 847d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 848d71ae5a4SJacob Faibussowitsch { 849076ba34aSJunchao Zhang Mat_Product *product = C->product; 850076ba34aSJunchao Zhang MatProductType ptype; 851076ba34aSJunchao Zhang Mat A, B; 852076ba34aSJunchao Zhang bool transA, transB; 853076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok, *bkok, *ckok; 854076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 855076ba34aSJunchao Zhang MPI_Comm comm; 8560e3ece09SJunchao Zhang KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 857a3f881fbSStefano Zampini 858a3f881fbSStefano Zampini PetscFunctionBegin; 859a3f881fbSStefano Zampini MatCheckProduct(C, 1); 8609566063dSJacob Faibussowitsch PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 8615f80ce2aSJacob Faibussowitsch PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 862a3f881fbSStefano Zampini A = product->A; 863a3f881fbSStefano Zampini B = product->B; 8649566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 8659566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 866a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 867a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 8680e3ece09SJunchao Zhang csrmatA = akok->csrmat; 8690e3ece09SJunchao Zhang csrmatB = bkok->csrmat; 870076ba34aSJunchao Zhang 871a3f881fbSStefano Zampini ptype = product->type; 8720e3ece09SJunchao Zhang // Take advantage of the symmetry if true 8730e3ece09SJunchao Zhang if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 8740e3ece09SJunchao Zhang ptype = MATPRODUCT_AB; 8750e3ece09SJunchao Zhang product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 8760e3ece09SJunchao Zhang } 8770e3ece09SJunchao Zhang if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 8780e3ece09SJunchao Zhang ptype = MATPRODUCT_AB; 8790e3ece09SJunchao Zhang product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 8800e3ece09SJunchao Zhang } 8810e3ece09SJunchao Zhang 882a3f881fbSStefano Zampini switch (ptype) { 8839371c9d4SSatish Balay case MATPRODUCT_AB: 8849371c9d4SSatish Balay transA = false; 8859371c9d4SSatish Balay transB = false; 886*0e6a1e94SMark Adams PetscCall(MatSetBlockSizesFromMats(C, A, B)); 8879371c9d4SSatish Balay break; 8889371c9d4SSatish Balay case MATPRODUCT_AtB: 8899371c9d4SSatish Balay transA = true; 8909371c9d4SSatish Balay transB = false; 891*0e6a1e94SMark Adams if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 892*0e6a1e94SMark Adams if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 8939371c9d4SSatish Balay break; 8949371c9d4SSatish Balay case MATPRODUCT_ABt: 8959371c9d4SSatish Balay transA = false; 8969371c9d4SSatish Balay transB = true; 897*0e6a1e94SMark Adams if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 898*0e6a1e94SMark Adams if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 8999371c9d4SSatish Balay break; 900d71ae5a4SJacob Faibussowitsch default: 901d71ae5a4SJacob Faibussowitsch SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 902a3f881fbSStefano Zampini } 9030e3ece09SJunchao Zhang PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 904076ba34aSJunchao Zhang pdata->reusesym = product->api_user; 905a3f881fbSStefano Zampini 906076ba34aSJunchao Zhang /* TODO: add command line options to select spgemm algorithms */ 907866eb059SJunchao Zhang auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 908866eb059SJunchao Zhang 909866eb059SJunchao Zhang /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 910866eb059SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 911866eb059SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 912866eb059SJunchao Zhang spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 913866eb059SJunchao Zhang #endif 914866eb059SJunchao Zhang #endif 9150e3ece09SJunchao Zhang PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 916076ba34aSJunchao Zhang 9179566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 918076ba34aSJunchao Zhang /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 919076ba34aSJunchao Zhang if (transA) { 9209566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 921076ba34aSJunchao Zhang transA = false; 922076ba34aSJunchao Zhang } 923076ba34aSJunchao Zhang 924076ba34aSJunchao Zhang if (transB) { 9259566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 926076ba34aSJunchao Zhang transB = false; 927076ba34aSJunchao Zhang } 928076ba34aSJunchao Zhang 9290e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 930076ba34aSJunchao Zhang /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 931076ba34aSJunchao Zhang So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 932076ba34aSJunchao Zhang calling new Mat_SeqAIJKokkos(). 933076ba34aSJunchao Zhang TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 934076ba34aSJunchao Zhang */ 9350e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 9360e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 937866eb059SJunchao Zhang /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 938866eb059SJunchao Zhang auto spgemmHandle = pdata->kh.get_spgemm_handle(); 939866eb059SJunchao Zhang if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 940e944a159SJunchao Zhang #endif 9419566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 942076ba34aSJunchao Zhang 9439566063dSJacob Faibussowitsch PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 9449566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 945076ba34aSJunchao Zhang C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 9463ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 947a3f881fbSStefano Zampini } 948a3f881fbSStefano Zampini 949a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 950d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 951d71ae5a4SJacob Faibussowitsch { 952076ba34aSJunchao Zhang Mat_Product *product = mat->product; 953a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 954a3f881fbSStefano Zampini 955a3f881fbSStefano Zampini PetscFunctionBegin; 956a3f881fbSStefano Zampini MatCheckProduct(mat, 1); 9579566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 95848a46eb9SPierre Jolivet if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 959a3f881fbSStefano Zampini if (Biskok && Ciskok) { 960a3f881fbSStefano Zampini switch (product->type) { 961a3f881fbSStefano Zampini case MATPRODUCT_AB: 962a3f881fbSStefano Zampini case MATPRODUCT_AtB: 963d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABt: 964d71ae5a4SJacob Faibussowitsch mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 965d71ae5a4SJacob Faibussowitsch break; 966a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 967a3f881fbSStefano Zampini case MATPRODUCT_RARt: 968d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABC: 969d71ae5a4SJacob Faibussowitsch mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 970d71ae5a4SJacob Faibussowitsch break; 971d71ae5a4SJacob Faibussowitsch default: 972d71ae5a4SJacob Faibussowitsch break; 973a3f881fbSStefano Zampini } 974a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 9759566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 976a3f881fbSStefano Zampini } 9773ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 978a3f881fbSStefano Zampini } 979a587d139SMark 980d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 981d71ae5a4SJacob Faibussowitsch { 982f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 983f0cf5187SStefano Zampini 984f0cf5187SStefano Zampini PetscFunctionBegin; 9859566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 9869566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 987f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 988d326c3f1SJunchao Zhang KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 9899566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 9909566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 9919566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 9923ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 993f0cf5187SStefano Zampini } 994f0cf5187SStefano Zampini 995f4747e26SJunchao Zhang // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 996f4747e26SJunchao Zhang static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 997f4747e26SJunchao Zhang { 998f4747e26SJunchao Zhang Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 999f4747e26SJunchao Zhang 1000f4747e26SJunchao Zhang PetscFunctionBegin; 1001f4747e26SJunchao Zhang if (A->assembled && aijseq->diagonaldense) { // no missing diagonals 1002f4747e26SJunchao Zhang PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 1003f4747e26SJunchao Zhang 1004f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1005f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1006f4747e26SJunchao Zhang const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1007f4747e26SJunchao Zhang const auto &Aa = aijkok->a_dual.view_device(); 1008f4747e26SJunchao Zhang const auto &Adiag = aijkok->diag_dual.view_device(); 1009d326c3f1SJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1010f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1011f4747e26SJunchao Zhang PetscCall(PetscLogGpuFlops(n)); 1012f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1013f4747e26SJunchao Zhang } else { // need reassembly, very slow! 1014f4747e26SJunchao Zhang PetscCall(MatShift_Basic(A, a)); 1015f4747e26SJunchao Zhang } 1016f4747e26SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 1017f4747e26SJunchao Zhang } 1018f4747e26SJunchao Zhang 1019f4747e26SJunchao Zhang static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1020f4747e26SJunchao Zhang { 1021f4747e26SJunchao Zhang Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1022f4747e26SJunchao Zhang 1023f4747e26SJunchao Zhang PetscFunctionBegin; 1024f4747e26SJunchao Zhang if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals 1025f4747e26SJunchao Zhang ConstPetscScalarKokkosView dv; 1026f4747e26SJunchao Zhang PetscInt n, nv; 1027f4747e26SJunchao Zhang 1028f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1029f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1030f4747e26SJunchao Zhang PetscCall(VecGetKokkosView(D, &dv)); 1031f4747e26SJunchao Zhang PetscCall(VecGetLocalSize(D, &nv)); 1032f4747e26SJunchao Zhang n = PetscMin(Y->rmap->n, Y->cmap->n); 1033f4747e26SJunchao Zhang PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1034f4747e26SJunchao Zhang 1035f4747e26SJunchao Zhang const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1036f4747e26SJunchao Zhang const auto &Aa = aijkok->a_dual.view_device(); 1037f4747e26SJunchao Zhang const auto &Adiag = aijkok->diag_dual.view_device(); 1038f4747e26SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( 1039d326c3f1SJunchao Zhang Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1040f4747e26SJunchao Zhang if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1041f4747e26SJunchao Zhang else Aa(Adiag(i)) += dv(i); 1042f4747e26SJunchao Zhang })); 1043f4747e26SJunchao Zhang PetscCall(VecRestoreKokkosView(D, &dv)); 1044f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1045f4747e26SJunchao Zhang PetscCall(PetscLogGpuFlops(n)); 1046f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1047f4747e26SJunchao Zhang } else { // need reassembly, very slow! 1048f4747e26SJunchao Zhang PetscCall(MatDiagonalSet_Default(Y, D, is)); 1049f4747e26SJunchao Zhang } 1050f4747e26SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 1051f4747e26SJunchao Zhang } 1052f4747e26SJunchao Zhang 1053f4747e26SJunchao Zhang static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1054f4747e26SJunchao Zhang { 1055f4747e26SJunchao Zhang Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1056f4747e26SJunchao Zhang PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1057f4747e26SJunchao Zhang ConstPetscScalarKokkosView lv, rv; 1058f4747e26SJunchao Zhang 1059f4747e26SJunchao Zhang PetscFunctionBegin; 1060f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1061f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1062f4747e26SJunchao Zhang const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1063f4747e26SJunchao Zhang const auto &Aa = aijkok->a_dual.view_device(); 1064f4747e26SJunchao Zhang const auto &Ai = aijkok->i_dual.view_device(); 1065f4747e26SJunchao Zhang const auto &Aj = aijkok->j_dual.view_device(); 1066f4747e26SJunchao Zhang if (ll) { 1067f4747e26SJunchao Zhang PetscCall(VecGetLocalSize(ll, &m)); 1068f4747e26SJunchao Zhang PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1069f4747e26SJunchao Zhang PetscCall(VecGetKokkosView(ll, &lv)); 1070f4747e26SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( // for each row 1071d326c3f1SJunchao Zhang Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1072f4747e26SJunchao Zhang PetscInt i = t.league_rank(); // row i 1073f4747e26SJunchao Zhang PetscInt len = Ai(i + 1) - Ai(i); 1074f4747e26SJunchao Zhang // scale entries on the row 1075f4747e26SJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1076f4747e26SJunchao Zhang })); 1077f4747e26SJunchao Zhang PetscCall(VecRestoreKokkosView(ll, &lv)); 1078f4747e26SJunchao Zhang PetscCall(PetscLogGpuFlops(nz)); 1079f4747e26SJunchao Zhang } 1080f4747e26SJunchao Zhang if (rr) { 1081f4747e26SJunchao Zhang PetscCall(VecGetLocalSize(rr, &n)); 1082f4747e26SJunchao Zhang PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1083f4747e26SJunchao Zhang PetscCall(VecGetKokkosView(rr, &rv)); 1084f4747e26SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1085d326c3f1SJunchao Zhang Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1086f4747e26SJunchao Zhang PetscCall(VecRestoreKokkosView(rr, &lv)); 1087f4747e26SJunchao Zhang PetscCall(PetscLogGpuFlops(nz)); 1088f4747e26SJunchao Zhang } 1089f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1090f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1091f4747e26SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 1092f4747e26SJunchao Zhang } 1093f4747e26SJunchao Zhang 1094d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1095d71ae5a4SJacob Faibussowitsch { 1096076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1097a587d139SMark 1098a587d139SMark PetscFunctionBegin; 1099076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 11002328674fSJunchao Zhang if (aijkok) { /* Only zero the device if data is already there */ 1101d326c3f1SJunchao Zhang KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 11029566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 11032328674fSJunchao Zhang } else { /* Might be preallocated but not assembled */ 11049566063dSJacob Faibussowitsch PetscCall(MatZeroEntries_SeqAIJ(A)); 11052328674fSJunchao Zhang } 11063ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1107a587d139SMark } 1108a587d139SMark 1109d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1110d71ae5a4SJacob Faibussowitsch { 1111f78ce678SMark Adams Mat_SeqAIJKokkos *aijkok; 1112f78ce678SMark Adams PetscInt n; 1113f78ce678SMark Adams PetscScalarKokkosView xv; 1114f78ce678SMark Adams 1115f78ce678SMark Adams PetscFunctionBegin; 1116f78ce678SMark Adams PetscCall(VecGetLocalSize(x, &n)); 1117f78ce678SMark Adams PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1118f78ce678SMark Adams PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1119f78ce678SMark Adams 1120f78ce678SMark Adams PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1121f78ce678SMark Adams aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1122f78ce678SMark Adams 1123f78ce678SMark Adams const auto &Aa = aijkok->a_dual.view_device(); 1124f78ce678SMark Adams const auto &Ai = aijkok->i_dual.view_device(); 1125f78ce678SMark Adams const auto &Adiag = aijkok->diag_dual.view_device(); 1126f78ce678SMark Adams 1127f78ce678SMark Adams PetscCall(VecGetKokkosViewWrite(x, &xv)); 11289371c9d4SSatish Balay Kokkos::parallel_for( 1129d326c3f1SJunchao Zhang Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1130f78ce678SMark Adams if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1131f78ce678SMark Adams else xv(i) = 0; 1132f78ce678SMark Adams }); 1133f78ce678SMark Adams PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 11343ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1135f78ce678SMark Adams } 1136f78ce678SMark Adams 1137db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1138d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1139d71ae5a4SJacob Faibussowitsch { 1140db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1141db78de30SJunchao Zhang 1142db78de30SJunchao Zhang PetscFunctionBegin; 1143db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 11444f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1145db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 11469566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1147db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1148076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 11493ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1150db78de30SJunchao Zhang } 1151db78de30SJunchao Zhang 1152d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1153d71ae5a4SJacob Faibussowitsch { 1154db78de30SJunchao Zhang PetscFunctionBegin; 1155db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 11564f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1157db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 11583ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1159db78de30SJunchao Zhang } 1160db78de30SJunchao Zhang 1161d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1162d71ae5a4SJacob Faibussowitsch { 1163db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1164db78de30SJunchao Zhang 1165db78de30SJunchao Zhang PetscFunctionBegin; 1166db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 11674f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1168db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 11699566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1170db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1171076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 11723ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1173db78de30SJunchao Zhang } 1174db78de30SJunchao Zhang 1175d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1176d71ae5a4SJacob Faibussowitsch { 1177db78de30SJunchao Zhang PetscFunctionBegin; 1178db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 11794f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1180db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 11819566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 11823ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1183db78de30SJunchao Zhang } 1184db78de30SJunchao Zhang 1185d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1186d71ae5a4SJacob Faibussowitsch { 1187db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1188db78de30SJunchao Zhang 1189db78de30SJunchao Zhang PetscFunctionBegin; 1190db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 11914f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1192db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1193db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1194076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 11953ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1196db78de30SJunchao Zhang } 1197db78de30SJunchao Zhang 1198d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1199d71ae5a4SJacob Faibussowitsch { 1200db78de30SJunchao Zhang PetscFunctionBegin; 1201db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 12024f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1203db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 12049566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 12053ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1206db78de30SJunchao Zhang } 1207db78de30SJunchao Zhang 1208c17cf699SJunchao Zhang /* Computes Y += alpha X */ 1209d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1210d71ae5a4SJacob Faibussowitsch { 1211a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1212c17cf699SJunchao Zhang Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1213c17cf699SJunchao Zhang ConstMatScalarKokkosView Xa; 1214c17cf699SJunchao Zhang MatScalarKokkosView Ya; 1215d326c3f1SJunchao Zhang auto &exec = PetscGetKokkosExecutionSpace(); 1216a587d139SMark 1217a587d139SMark PetscFunctionBegin; 1218c17cf699SJunchao Zhang PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1219c17cf699SJunchao Zhang PetscCheckTypeName(X, MATSEQAIJKOKKOS); 12209566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 12219566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(X)); 12229566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1223db78de30SJunchao Zhang 1224c17cf699SJunchao Zhang if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1225a587d139SMark PetscBool e; 12269566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1227a587d139SMark if (e) { 12289566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1229c17cf699SJunchao Zhang if (e) pattern = SAME_NONZERO_PATTERN; 1230a587d139SMark } 1231a587d139SMark } 1232db78de30SJunchao Zhang 1233c17cf699SJunchao Zhang /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1234c17cf699SJunchao Zhang cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1235c17cf699SJunchao Zhang If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1236c17cf699SJunchao Zhang KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1237c17cf699SJunchao Zhang */ 1238c17cf699SJunchao Zhang ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1239c17cf699SJunchao Zhang xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1240c17cf699SJunchao Zhang Xa = xkok->a_dual.view_device(); 1241c17cf699SJunchao Zhang Ya = ykok->a_dual.view_device(); 1242c17cf699SJunchao Zhang 1243c17cf699SJunchao Zhang if (pattern == SAME_NONZERO_PATTERN) { 1244d326c3f1SJunchao Zhang KokkosBlas::axpy(exec, alpha, Xa, Ya); 12459566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1246c17cf699SJunchao Zhang } else if (pattern == SUBSET_NONZERO_PATTERN) { 1247c17cf699SJunchao Zhang MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1248c17cf699SJunchao Zhang MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1249c17cf699SJunchao Zhang 12509371c9d4SSatish Balay Kokkos::parallel_for( 1251d326c3f1SJunchao Zhang Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 12520e3ece09SJunchao Zhang PetscInt i = t.league_rank(); // row i 12530e3ece09SJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { 12540e3ece09SJunchao Zhang // Only one thread works in a team 1255c17cf699SJunchao Zhang PetscInt p, q = Yi(i); 12560e3ece09SJunchao Zhang for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 12570e3ece09SJunchao Zhang while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 12580e3ece09SJunchao Zhang if (Xj(p) == Yj(q)) { // Found it 1259c17cf699SJunchao Zhang Ya(q) += alpha * Xa(p); 1260c17cf699SJunchao Zhang q++; 1261a587d139SMark } else { 12620e3ece09SJunchao Zhang // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 12630e3ece09SJunchao Zhang // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 12640e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 12650e3ece09SJunchao Zhang if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 12668b8b16f9SJunchao Zhang #else 12670e3ece09SJunchao Zhang if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 12688b8b16f9SJunchao Zhang #endif 1269a587d139SMark } 1270c17cf699SJunchao Zhang } 1271c17cf699SJunchao Zhang }); 1272c17cf699SJunchao Zhang }); 12739566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 12740e3ece09SJunchao Zhang } else { // different nonzero patterns 1275c17cf699SJunchao Zhang Mat Z; 1276c17cf699SJunchao Zhang KokkosCsrMatrix zcsr; 1277c17cf699SJunchao Zhang KernelHandle kh; 12780e3ece09SJunchao Zhang kh.create_spadd_handle(true); // X, Y are sorted 1279c17cf699SJunchao Zhang KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1280c17cf699SJunchao Zhang KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1281c17cf699SJunchao Zhang zkok = new Mat_SeqAIJKokkos(zcsr); 12829566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 12839566063dSJacob Faibussowitsch PetscCall(MatHeaderReplace(Y, &Z)); 1284c17cf699SJunchao Zhang kh.destroy_spadd_handle(); 1285c17cf699SJunchao Zhang } 12869566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 12870e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 12883ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1289a587d139SMark } 1290a587d139SMark 12912c4ab24aSJunchao Zhang struct MatCOOStruct_SeqAIJKokkos { 12922c4ab24aSJunchao Zhang PetscCount n; 12932c4ab24aSJunchao Zhang PetscCount Atot; 12942c4ab24aSJunchao Zhang PetscInt nz; 12952c4ab24aSJunchao Zhang PetscCountKokkosView jmap; 12962c4ab24aSJunchao Zhang PetscCountKokkosView perm; 12972c4ab24aSJunchao Zhang 12982c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 12992c4ab24aSJunchao Zhang { 13002c4ab24aSJunchao Zhang nz = coo_h->nz; 13012c4ab24aSJunchao Zhang n = coo_h->n; 13022c4ab24aSJunchao Zhang Atot = coo_h->Atot; 13032c4ab24aSJunchao Zhang jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 13042c4ab24aSJunchao Zhang perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 13052c4ab24aSJunchao Zhang } 13062c4ab24aSJunchao Zhang }; 13072c4ab24aSJunchao Zhang 130849abdd8aSBarry Smith static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data) 13092c4ab24aSJunchao Zhang { 13102c4ab24aSJunchao Zhang PetscFunctionBegin; 131149abdd8aSBarry Smith PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data)); 13122c4ab24aSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 13132c4ab24aSJunchao Zhang } 13142c4ab24aSJunchao Zhang 1315d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1316d71ae5a4SJacob Faibussowitsch { 131742550becSJunchao Zhang Mat_SeqAIJKokkos *akok; 131842550becSJunchao Zhang Mat_SeqAIJ *aseq; 131903e76207SPierre Jolivet PetscContainer container_h; 13202c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJ *coo_h; 13212c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJKokkos *coo_d; 132242550becSJunchao Zhang 132342550becSJunchao Zhang PetscFunctionBegin; 13249566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1325394ed5ebSJunchao Zhang aseq = static_cast<Mat_SeqAIJ *>(mat->data); 132642550becSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1327cbc6b225SStefano Zampini delete akok; 1328f4747e26SJunchao Zhang mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 13299566063dSJacob Faibussowitsch PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 13302c4ab24aSJunchao Zhang 13312c4ab24aSJunchao Zhang // Copy the COO struct to device 13322c4ab24aSJunchao Zhang PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 13332c4ab24aSJunchao Zhang PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 13342c4ab24aSJunchao Zhang PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 13352c4ab24aSJunchao Zhang 13362c4ab24aSJunchao Zhang // Put the COO struct in a container and then attach that to the matrix 133703e76207SPierre Jolivet PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos)); 13383ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 133942550becSJunchao Zhang } 134042550becSJunchao Zhang 1341d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1342d71ae5a4SJacob Faibussowitsch { 134342550becSJunchao Zhang MatScalarKokkosView Aa; 134442550becSJunchao Zhang ConstMatScalarKokkosView kv; 134542550becSJunchao Zhang PetscMemType memtype; 13462c4ab24aSJunchao Zhang PetscContainer container; 13472c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJKokkos *coo; 134842550becSJunchao Zhang 134942550becSJunchao Zhang PetscFunctionBegin; 13502c4ab24aSJunchao Zhang PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 13512c4ab24aSJunchao Zhang PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 13522c4ab24aSJunchao Zhang 13532c4ab24aSJunchao Zhang const auto &n = coo->n; 13542c4ab24aSJunchao Zhang const auto &Annz = coo->nz; 13552c4ab24aSJunchao Zhang const auto &jmap = coo->jmap; 13562c4ab24aSJunchao Zhang const auto &perm = coo->perm; 13572c4ab24aSJunchao Zhang 13589566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(v, &memtype)); 135942550becSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 13602c4ab24aSJunchao Zhang kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 136142550becSJunchao Zhang } else { 13622c4ab24aSJunchao Zhang kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 136342550becSJunchao Zhang } 136442550becSJunchao Zhang 1365c7b718f4SJunchao Zhang if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1366c7b718f4SJunchao Zhang else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 136742550becSJunchao Zhang 136808bb9926SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 13699371c9d4SSatish Balay Kokkos::parallel_for( 1370d326c3f1SJunchao Zhang Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1371c7b718f4SJunchao Zhang PetscScalar sum = 0.0; 1372c7b718f4SJunchao Zhang for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1373c7b718f4SJunchao Zhang Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1374c7b718f4SJunchao Zhang }); 137508bb9926SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1376394ed5ebSJunchao Zhang 13779566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 13789566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 13793ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 138042550becSJunchao Zhang } 138142550becSJunchao Zhang 1382d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1383d71ae5a4SJacob Faibussowitsch { 13848f7e8f9dSMark Adams PetscFunctionBegin; 13859566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncHost(A)); 13869566063dSJacob Faibussowitsch PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 13878f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 13883ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 13898f7e8f9dSMark Adams } 13908f7e8f9dSMark Adams 1391d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1392d71ae5a4SJacob Faibussowitsch { 1393076ba34aSJunchao Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1394076ba34aSJunchao Zhang 13958c3ff71bSJunchao Zhang PetscFunctionBegin; 1396076ba34aSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 13976f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 13986f3d89d0SStefano Zampini 13998c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 14008c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 14018c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1402a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 1403f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 1404a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1405076ba34aSJunchao Zhang A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 14068c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 14078c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 14088c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 14098c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 14108c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 14118c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1412076ba34aSJunchao Zhang A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 14130ecb592aSJunchao Zhang A->ops->transpose = MatTranspose_SeqAIJKokkos; 1414152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 1415f78ce678SMark Adams A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1416f4747e26SJunchao Zhang A->ops->shift = MatShift_SeqAIJKokkos; 1417f4747e26SJunchao Zhang A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1418f4747e26SJunchao Zhang A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1419076ba34aSJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1420076ba34aSJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1421076ba34aSJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1422076ba34aSJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1423076ba34aSJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1424076ba34aSJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 14257ee59b9bSJunchao Zhang a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 142642550becSJunchao Zhang 14279566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 14289566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 142957761e9aSJunchao Zhang #if defined(PETSC_HAVE_HYPRE) 143057761e9aSJunchao Zhang PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE)); 143157761e9aSJunchao Zhang #endif 14323ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1433076ba34aSJunchao Zhang } 1434076ba34aSJunchao Zhang 14359d13fa56SJunchao Zhang /* 14369d13fa56SJunchao Zhang Extract the (prescribled) diagonal blocks of the matrix and then invert them 14379d13fa56SJunchao Zhang 14389d13fa56SJunchao Zhang Input Parameters: 14399d13fa56SJunchao Zhang + A - the MATSEQAIJKOKKOS matrix 14409d13fa56SJunchao Zhang . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 14419d13fa56SJunchao Zhang . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 14429d13fa56SJunchao Zhang . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 14439d13fa56SJunchao Zhang - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 14449d13fa56SJunchao Zhang 14459d13fa56SJunchao Zhang Output Parameter: 14469d13fa56SJunchao Zhang . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 14479d13fa56SJunchao Zhang */ 14489d13fa56SJunchao Zhang PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 14499d13fa56SJunchao Zhang { 14509d13fa56SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 14519d13fa56SJunchao Zhang PetscInt nblocks = bs.extent(0) - 1; 14529d13fa56SJunchao Zhang 14539d13fa56SJunchao Zhang PetscFunctionBegin; 14549d13fa56SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 14559d13fa56SJunchao Zhang 14569d13fa56SJunchao Zhang // Pull out the diagonal blocks of the matrix and then invert the blocks 14579d13fa56SJunchao Zhang auto Aa = akok->a_dual.view_device(); 14589d13fa56SJunchao Zhang auto Ai = akok->i_dual.view_device(); 14599d13fa56SJunchao Zhang auto Aj = akok->j_dual.view_device(); 14609d13fa56SJunchao Zhang auto Adiag = akok->diag_dual.view_device(); 14619d13fa56SJunchao Zhang // TODO: how to tune the team size? 14629d13fa56SJunchao Zhang #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HOST) 14639d13fa56SJunchao Zhang auto ts = Kokkos::AUTO(); 14649d13fa56SJunchao Zhang #else 14659d13fa56SJunchao Zhang auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 14669d13fa56SJunchao Zhang #endif 14679d13fa56SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( 1468d326c3f1SJunchao Zhang Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 14699d13fa56SJunchao Zhang const PetscInt bid = teamMember.league_rank(); // block id 14709d13fa56SJunchao Zhang const PetscInt rstart = bs(bid); // this block starts from this row 14719d13fa56SJunchao Zhang const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 14729d13fa56SJunchao Zhang const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 14739d13fa56SJunchao Zhang const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 14749d13fa56SJunchao Zhang 14759d13fa56SJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 14769d13fa56SJunchao Zhang PetscInt i = rstart + r; // i-th row in A 14779d13fa56SJunchao Zhang 14789d13fa56SJunchao Zhang if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 14799d13fa56SJunchao Zhang PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 14809d13fa56SJunchao Zhang 14819d13fa56SJunchao Zhang for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 14829d13fa56SJunchao Zhang if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 14839d13fa56SJunchao Zhang B(r, c) = 0.0; 14849d13fa56SJunchao Zhang } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 14859d13fa56SJunchao Zhang B(r, c) = Aa(first + c); 14869d13fa56SJunchao Zhang } else { // this entry does not show up in the CSR 14879d13fa56SJunchao Zhang B(r, c) = 0.0; 14889d13fa56SJunchao Zhang } 14899d13fa56SJunchao Zhang } 14909d13fa56SJunchao Zhang } else { // rare case that the diagonal does not exist 14919d13fa56SJunchao Zhang const PetscInt begin = Ai(i); 14929d13fa56SJunchao Zhang const PetscInt end = Ai(i + 1); 14939d13fa56SJunchao Zhang for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 14949d13fa56SJunchao Zhang for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 14959d13fa56SJunchao Zhang if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 14969d13fa56SJunchao Zhang else if (Aj(j) >= rstart + m) break; 14979d13fa56SJunchao Zhang } 14989d13fa56SJunchao Zhang } 14999d13fa56SJunchao Zhang }); 15009d13fa56SJunchao Zhang 15019d13fa56SJunchao Zhang // LU-decompose B (w/o pivoting) and then invert B 15029d13fa56SJunchao Zhang KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 15039d13fa56SJunchao Zhang KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 15049d13fa56SJunchao Zhang })); 15059d13fa56SJunchao Zhang // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 15069d13fa56SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 15079d13fa56SJunchao Zhang } 15089d13fa56SJunchao Zhang 1509d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1510d71ae5a4SJacob Faibussowitsch { 1511076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 1512076ba34aSJunchao Zhang PetscInt i, m, n; 1513e36ced11SJunchao Zhang auto &exec = PetscGetKokkosExecutionSpace(); 1514076ba34aSJunchao Zhang 1515076ba34aSJunchao Zhang PetscFunctionBegin; 15165f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1517076ba34aSJunchao Zhang 1518076ba34aSJunchao Zhang m = akok->nrows(); 1519076ba34aSJunchao Zhang n = akok->ncols(); 15209566063dSJacob Faibussowitsch PetscCall(MatSetSizes(A, m, n, m, n)); 15219566063dSJacob Faibussowitsch PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1522076ba34aSJunchao Zhang 1523076ba34aSJunchao Zhang /* Set up data structures of A as a MATSEQAIJ */ 15249566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 152557508eceSPierre Jolivet aseq = (Mat_SeqAIJ *)A->data; 1526076ba34aSJunchao Zhang 1527e36ced11SJunchao Zhang PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */ 1528e36ced11SJunchao Zhang PetscCallCXX(akok->j_dual.sync_host(exec)); 1529e36ced11SJunchao Zhang PetscCallCXX(exec.fence()); 1530076ba34aSJunchao Zhang 1531076ba34aSJunchao Zhang aseq->i = akok->i_host_data(); 1532076ba34aSJunchao Zhang aseq->j = akok->j_host_data(); 1533076ba34aSJunchao Zhang aseq->a = akok->a_host_data(); 1534076ba34aSJunchao Zhang aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1535076ba34aSJunchao Zhang aseq->free_a = PETSC_FALSE; 1536076ba34aSJunchao Zhang aseq->free_ij = PETSC_FALSE; 1537076ba34aSJunchao Zhang aseq->nz = akok->nnz(); 1538076ba34aSJunchao Zhang aseq->maxnz = aseq->nz; 1539076ba34aSJunchao Zhang 15409566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aseq->imax)); 15419566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aseq->ilen)); 1542ad540459SPierre Jolivet for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1543076ba34aSJunchao Zhang 1544076ba34aSJunchao Zhang /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1545076ba34aSJunchao Zhang akok->nonzerostate = A->nonzerostate; 1546ff751488SJunchao Zhang A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 15479566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 15489566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 15493ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1550076ba34aSJunchao Zhang } 1551076ba34aSJunchao Zhang 15520e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 15530e3ece09SJunchao Zhang { 15540e3ece09SJunchao Zhang PetscFunctionBegin; 15550e3ece09SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(A)); 15560e3ece09SJunchao Zhang *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 15570e3ece09SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 15580e3ece09SJunchao Zhang } 15590e3ece09SJunchao Zhang 15600e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 15610e3ece09SJunchao Zhang { 15620e3ece09SJunchao Zhang Mat_SeqAIJKokkos *akok; 15634d86920dSPierre Jolivet 15640e3ece09SJunchao Zhang PetscFunctionBegin; 15650e3ece09SJunchao Zhang PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 15660e3ece09SJunchao Zhang PetscCall(MatCreate(comm, A)); 15670e3ece09SJunchao Zhang PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 15680e3ece09SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 15690e3ece09SJunchao Zhang } 15700e3ece09SJunchao Zhang 1571076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1572076ba34aSJunchao Zhang 1573076ba34aSJunchao Zhang Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1574076ba34aSJunchao Zhang */ 1575d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1576d71ae5a4SJacob Faibussowitsch { 1577076ba34aSJunchao Zhang PetscFunctionBegin; 15789566063dSJacob Faibussowitsch PetscCall(MatCreate(comm, A)); 15799566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 15803ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 15818c3ff71bSJunchao Zhang } 15828c3ff71bSJunchao Zhang 1583152b3e56SJunchao Zhang /*@C 158411a5261eSBarry Smith MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 15858c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 158620f4b53cSBarry Smith Kokkos for calculations. 15878c3ff71bSJunchao Zhang 15888c3ff71bSJunchao Zhang Collective 15898c3ff71bSJunchao Zhang 15908c3ff71bSJunchao Zhang Input Parameters: 159111a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF` 15928c3ff71bSJunchao Zhang . m - number of rows 15938c3ff71bSJunchao Zhang . n - number of columns 159420f4b53cSBarry Smith . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 159520f4b53cSBarry Smith - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 15968c3ff71bSJunchao Zhang 15978c3ff71bSJunchao Zhang Output Parameter: 15988c3ff71bSJunchao Zhang . A - the matrix 15998c3ff71bSJunchao Zhang 16002ef1f0ffSBarry Smith Level: intermediate 16012ef1f0ffSBarry Smith 16022ef1f0ffSBarry Smith Notes: 160311a5261eSBarry Smith It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 16048c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 160511a5261eSBarry Smith [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 16068c3ff71bSJunchao Zhang 160711a5261eSBarry Smith The AIJ format, also called 16082ef1f0ffSBarry Smith compressed row storage, is fully compatible with standard Fortran 16098c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 161020f4b53cSBarry Smith either one (as in Fortran) or zero. 16118c3ff71bSJunchao Zhang 16122ef1f0ffSBarry Smith Specify the preallocated storage with either `nz` or `nnz` (not both). 16132ef1f0ffSBarry Smith Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 16142ef1f0ffSBarry Smith allocation. 16158c3ff71bSJunchao Zhang 1616fe59aa6dSJacob Faibussowitsch .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 16178c3ff71bSJunchao Zhang @*/ 1618d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1619d71ae5a4SJacob Faibussowitsch { 16208c3ff71bSJunchao Zhang PetscFunctionBegin; 16219566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 16229566063dSJacob Faibussowitsch PetscCall(MatCreate(comm, A)); 16239566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A, m, n, m, n)); 16249566063dSJacob Faibussowitsch PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 16259566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 16263ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 16278c3ff71bSJunchao Zhang } 1628930e68a5SMark Adams 1629d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1630d71ae5a4SJacob Faibussowitsch { 1631930e68a5SMark Adams PetscFunctionBegin; 16329566063dSJacob Faibussowitsch PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 163386a27549SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 16343ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 163586a27549SJunchao Zhang } 163686a27549SJunchao Zhang 1637d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1638d71ae5a4SJacob Faibussowitsch { 163986a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 164086a27549SJunchao Zhang 164186a27549SJunchao Zhang PetscFunctionBegin; 164286a27549SJunchao Zhang if (!factors->sptrsv_symbolic_completed) { 164386a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 164486a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 164586a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_TRUE; 164686a27549SJunchao Zhang } 16473ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 164886a27549SJunchao Zhang } 164986a27549SJunchao Zhang 165086a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */ 1651d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1652d71ae5a4SJacob Faibussowitsch { 165386a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1654076ba34aSJunchao Zhang MatColIdxType n = A->rmap->n; 165586a27549SJunchao Zhang 165686a27549SJunchao Zhang PetscFunctionBegin; 165786a27549SJunchao Zhang if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 165886a27549SJunchao Zhang /* Update L^T and do sptrsv symbolic */ 16597b8d4ba6SJunchao Zhang factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0 16607b8d4ba6SJunchao Zhang factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 16617b8d4ba6SJunchao Zhang factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 166286a27549SJunchao Zhang 16639371c9d4SSatish Balay transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 166486a27549SJunchao Zhang factors->iLt_d, factors->jLt_d, factors->aLt_d); 166586a27549SJunchao Zhang 166686a27549SJunchao Zhang /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 166786a27549SJunchao Zhang We have to sort the indices, until KK provides finer control options. 166886a27549SJunchao Zhang */ 16699371c9d4SSatish Balay sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 167086a27549SJunchao Zhang 167186a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 167286a27549SJunchao Zhang 167386a27549SJunchao Zhang /* Update U^T and do sptrsv symbolic */ 16747b8d4ba6SJunchao Zhang factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0 16757b8d4ba6SJunchao Zhang factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 16767b8d4ba6SJunchao Zhang factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 167786a27549SJunchao Zhang 16789371c9d4SSatish Balay transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 167986a27549SJunchao Zhang factors->iUt_d, factors->jUt_d, factors->aUt_d); 168086a27549SJunchao Zhang 168186a27549SJunchao Zhang /* Sort indices. See comments above */ 16829371c9d4SSatish Balay sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 168386a27549SJunchao Zhang 168486a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 168586a27549SJunchao Zhang factors->transpose_updated = PETSC_TRUE; 168686a27549SJunchao Zhang } 16873ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 168886a27549SJunchao Zhang } 168986a27549SJunchao Zhang 169086a27549SJunchao Zhang /* Solve Ax = b, with A = LU */ 1691d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1692d71ae5a4SJacob Faibussowitsch { 169386a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 169486a27549SJunchao Zhang PetscScalarKokkosView xv; 169586a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 169686a27549SJunchao Zhang 169786a27549SJunchao Zhang PetscFunctionBegin; 16989566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 16999566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 17009566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(b, &bv)); 17019566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(x, &xv)); 170286a27549SJunchao Zhang /* Solve L tmpv = b */ 17039566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 170486a27549SJunchao Zhang /* Solve Ux = tmpv */ 17059566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 17069566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(b, &bv)); 17079566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 17089566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 17093ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 171086a27549SJunchao Zhang } 171186a27549SJunchao Zhang 1712076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */ 1713d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1714d71ae5a4SJacob Faibussowitsch { 171586a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 171686a27549SJunchao Zhang PetscScalarKokkosView xv; 171786a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 171886a27549SJunchao Zhang 171986a27549SJunchao Zhang PetscFunctionBegin; 17209566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 17219566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 17229566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(b, &bv)); 17239566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(x, &xv)); 172486a27549SJunchao Zhang /* Solve U^T tmpv = b */ 172586a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 172686a27549SJunchao Zhang 172786a27549SJunchao Zhang /* Solve L^T x = tmpv */ 172886a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 17299566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(b, &bv)); 17309566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 17319566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 17323ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 173386a27549SJunchao Zhang } 173486a27549SJunchao Zhang 1735d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1736d71ae5a4SJacob Faibussowitsch { 173786a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 173886a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 173986a27549SJunchao Zhang PetscInt fill_lev = info->levels; 174086a27549SJunchao Zhang 174186a27549SJunchao Zhang PetscFunctionBegin; 17429566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 17439566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1744076ba34aSJunchao Zhang 1745076ba34aSJunchao Zhang auto a_d = aijkok->a_dual.view_device(); 1746076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1747076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1748076ba34aSJunchao Zhang 1749076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 175086a27549SJunchao Zhang 175186a27549SJunchao Zhang B->assembled = PETSC_TRUE; 175286a27549SJunchao Zhang B->preallocated = PETSC_TRUE; 175386a27549SJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos; 175486a27549SJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 175586a27549SJunchao Zhang B->ops->matsolve = NULL; 175686a27549SJunchao Zhang B->ops->matsolvetranspose = NULL; 175786a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 175886a27549SJunchao Zhang 175986a27549SJunchao Zhang /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 176086a27549SJunchao Zhang factors->transpose_updated = PETSC_FALSE; 176186a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 1762eeadb341SJunchao Zhang /* TODO: log flops, but how to know that? */ 17639566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 17643ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 176586a27549SJunchao Zhang } 176686a27549SJunchao Zhang 1767d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1768d71ae5a4SJacob Faibussowitsch { 176986a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 177086a27549SJunchao Zhang Mat_SeqAIJ *b; 177186a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 177286a27549SJunchao Zhang PetscInt fill_lev = info->levels; 177386a27549SJunchao Zhang PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 177486a27549SJunchao Zhang PetscInt n = A->rmap->n; 177586a27549SJunchao Zhang 177686a27549SJunchao Zhang PetscFunctionBegin; 17779566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 177886a27549SJunchao Zhang /* Rebuild factors */ 17799371c9d4SSatish Balay if (factors) { 17809371c9d4SSatish Balay factors->Destroy(); 17819371c9d4SSatish Balay } /* Destroy the old if it exists */ 17829371c9d4SSatish Balay else { 17839371c9d4SSatish Balay B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 17849371c9d4SSatish Balay } 178586a27549SJunchao Zhang 178686a27549SJunchao Zhang /* Create a spiluk handle and then do symbolic factorization */ 178786a27549SJunchao Zhang nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 178886a27549SJunchao Zhang factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 178986a27549SJunchao Zhang 179086a27549SJunchao Zhang auto spiluk_handle = factors->kh.get_spiluk_handle(); 179186a27549SJunchao Zhang 179286a27549SJunchao Zhang Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 179386a27549SJunchao Zhang Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 179486a27549SJunchao Zhang Kokkos::realloc(factors->iU_d, n + 1); 179586a27549SJunchao Zhang Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 179686a27549SJunchao Zhang 179786a27549SJunchao Zhang aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1798076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1799076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1800076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 180186a27549SJunchao Zhang /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 180286a27549SJunchao Zhang 180386a27549SJunchao Zhang Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 180486a27549SJunchao Zhang Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 180586a27549SJunchao Zhang Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 180686a27549SJunchao Zhang Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 180786a27549SJunchao Zhang 180886a27549SJunchao Zhang /* TODO: add options to select sptrsv algorithms */ 180986a27549SJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 181086a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 181186a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 181286a27549SJunchao Zhang #else 181386a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 181486a27549SJunchao Zhang #endif 181586a27549SJunchao Zhang 181686a27549SJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 181786a27549SJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 181886a27549SJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 181986a27549SJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 182086a27549SJunchao Zhang 182186a27549SJunchao Zhang /* Fill fields of the factor matrix B */ 18229566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 182386a27549SJunchao Zhang b = (Mat_SeqAIJ *)B->data; 182486a27549SJunchao Zhang b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 182586a27549SJunchao Zhang B->info.fill_ratio_given = info->fill; 1826a1e4e190SStefano Zampini B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 182786a27549SJunchao Zhang 182886a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 182986a27549SJunchao Zhang B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 18303ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1831930e68a5SMark Adams } 1832930e68a5SMark Adams 1833d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1834d71ae5a4SJacob Faibussowitsch { 1835930e68a5SMark Adams PetscFunctionBegin; 1836930e68a5SMark Adams *type = MATSOLVERKOKKOS; 18373ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1838930e68a5SMark Adams } 1839930e68a5SMark Adams 1840930e68a5SMark Adams /*MC 184186a27549SJunchao Zhang MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 184211a5261eSBarry Smith on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1843930e68a5SMark Adams 1844930e68a5SMark Adams Level: beginner 1845930e68a5SMark Adams 18461cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1847930e68a5SMark Adams M*/ 184886a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1849930e68a5SMark Adams { 1850930e68a5SMark Adams PetscInt n = A->rmap->n; 1851930e68a5SMark Adams 1852930e68a5SMark Adams PetscFunctionBegin; 18539566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 18549566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*B, n, n, n, n)); 1855930e68a5SMark Adams (*B)->factortype = ftype; 18569566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 18579566063dSJacob Faibussowitsch PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1858930e68a5SMark Adams 18598f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 18609566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 186186a27549SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 186286a27549SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 186386a27549SJunchao Zhang } else if (ftype == MAT_FACTOR_ILU) { 18649566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 186586a27549SJunchao Zhang (*B)->canuseordering = PETSC_FALSE; 186686a27549SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 186798921bdaSJacob Faibussowitsch } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1868930e68a5SMark Adams 18699566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1870f4f49eeaSPierre Jolivet PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 18713ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1872930e68a5SMark Adams } 18738f7e8f9dSMark Adams 1874d1f0640dSPierre Jolivet PETSC_INTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1875d71ae5a4SJacob Faibussowitsch { 187686a27549SJunchao Zhang PetscFunctionBegin; 18779566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 18789566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 18793ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 188086a27549SJunchao Zhang } 188186a27549SJunchao Zhang 1882076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */ 1883d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1884d71ae5a4SJacob Faibussowitsch { 1885076ba34aSJunchao Zhang const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map); 1886076ba34aSJunchao Zhang const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries); 1887076ba34aSJunchao Zhang const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values); 1888076ba34aSJunchao Zhang const PetscInt *i = iv.data(); 1889076ba34aSJunchao Zhang const PetscInt *j = jv.data(); 1890076ba34aSJunchao Zhang const PetscScalar *a = av.data(); 1891076ba34aSJunchao Zhang PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 1892076ba34aSJunchao Zhang 1893076ba34aSJunchao Zhang PetscFunctionBegin; 18949566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 1895076ba34aSJunchao Zhang for (PetscInt k = 0; k < m; k++) { 18969566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 189748a46eb9SPierre Jolivet for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 18989566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 1899076ba34aSJunchao Zhang } 19003ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1901076ba34aSJunchao Zhang } 1902