1e36ced11SJunchao Zhang #include <petsc_kokkos.hpp> 211d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 3076ba34aSJunchao Zhang #include <petscpkg_version.h> 4152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 542550becSJunchao Zhang #include <petsc/private/sfimpl.h> 68c3ff71bSJunchao Zhang #include <petscsystypes.h> 78c3ff71bSJunchao Zhang #include <petscerror.h> 88c3ff71bSJunchao Zhang 98c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 10f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 118c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 12cc6e31f1SJunchao Zhang 13cc6e31f1SJunchao Zhang // To suppress compiler warnings: 14cc6e31f1SJunchao Zhang // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63: 15cc6e31f1SJunchao Zhang // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, 16cc6e31f1SJunchao Zhang // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*, 17cc6e31f1SJunchao Zhang // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations] 18cc6e31f1SJunchao Zhang PETSC_PRAGMA_DIAGNOSTIC_IGNORED_BEGIN("-Wdeprecated-declarations") 198c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 20cc6e31f1SJunchao Zhang PETSC_PRAGMA_DIAGNOSTIC_IGNORED_END() 21cc6e31f1SJunchao Zhang 2286a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp> 2386a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp> 24076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp> 25076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp> 269d13fa56SJunchao Zhang #include <KokkosBatched_LU_Decl.hpp> 279d13fa56SJunchao Zhang #include <KokkosBatched_InverseLU_Decl.hpp> 2886a27549SJunchao Zhang 2942550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 308c3ff71bSJunchao Zhang 310e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 32f98996d3SJunchao Zhang #include <KokkosSparse_Utils.hpp> 33f98996d3SJunchao Zhang using KokkosSparse::sort_crs_matrix; 349371c9d4SSatish Balay using KokkosSparse::Impl::transpose_matrix; 35f98996d3SJunchao Zhang #else 36f98996d3SJunchao Zhang #include <KokkosKernels_Sorting.hpp> 37f98996d3SJunchao Zhang using KokkosKernels::sort_crs_matrix; 389371c9d4SSatish Balay using KokkosKernels::Impl::transpose_matrix; 39f98996d3SJunchao Zhang #endif 40f98996d3SJunchao Zhang 418c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 428c3ff71bSJunchao Zhang 43076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 44076ba34aSJunchao Zhang we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 45076ba34aSJunchao Zhang In the latter case, it is important to set a_dual's sync state correctly. 46076ba34aSJunchao Zhang */ 47d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 48d71ae5a4SJacob Faibussowitsch { 49076ba34aSJunchao Zhang Mat_SeqAIJ *aijseq; 50076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 518c3ff71bSJunchao Zhang 528c3ff71bSJunchao Zhang PetscFunctionBegin; 533ba16761SJacob Faibussowitsch if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 549566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 55076ba34aSJunchao Zhang 56076ba34aSJunchao Zhang aijseq = static_cast<Mat_SeqAIJ *>(A->data); 57076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 58076ba34aSJunchao Zhang 59076ba34aSJunchao Zhang /* If aijkok does not exist, we just copy i, j to device. 60076ba34aSJunchao Zhang If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 61076ba34aSJunchao Zhang In both cases, we build a new aijkok structure. 62076ba34aSJunchao Zhang */ 63076ba34aSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 64076ba34aSJunchao Zhang delete aijkok; 65f4747e26SJunchao Zhang aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 66076ba34aSJunchao Zhang A->spptr = aijkok; 67f4747e26SJunchao Zhang } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag. 68f4747e26SJunchao Zhang MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n); 69f4747e26SJunchao Zhang auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h); 70f4747e26SJunchao Zhang aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h); 71076ba34aSJunchao Zhang } 723ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 738c3ff71bSJunchao Zhang } 748c3ff71bSJunchao Zhang 7586a27549SJunchao Zhang /* Sync CSR data to device if not yet */ 76d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 77d71ae5a4SJacob Faibussowitsch { 788c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 798c3ff71bSJunchao Zhang 808c3ff71bSJunchao Zhang PetscFunctionBegin; 81aaa8cc7dSPierre Jolivet PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 825f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 83076ba34aSJunchao Zhang if (aijkok->a_dual.need_sync_device()) { 84076ba34aSJunchao Zhang aijkok->a_dual.sync_device(); 85580c7c76SPierre Jolivet aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 8686a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 878c3ff71bSJunchao Zhang } 883ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 898c3ff71bSJunchao Zhang } 908c3ff71bSJunchao Zhang 91076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */ 92d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 93d71ae5a4SJacob Faibussowitsch { 9486a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 9586a27549SJunchao Zhang 9686a27549SJunchao Zhang PetscFunctionBegin; 975f80ce2aSJacob Faibussowitsch PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 9886a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 9986a27549SJunchao Zhang aijkok->a_dual.modify_device(); 10086a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 10186a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 1029566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 1039566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 1043ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 10586a27549SJunchao Zhang } 10686a27549SJunchao Zhang 107d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 108d71ae5a4SJacob Faibussowitsch { 109f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 110*4df4a32cSJunchao Zhang auto exec = PetscGetKokkosExecutionSpace(); 111f0cf5187SStefano Zampini 112f0cf5187SStefano Zampini PetscFunctionBegin; 113f0cf5187SStefano Zampini PetscCheckTypeName(A, MATSEQAIJKOKKOS); 11486a27549SJunchao Zhang /* We do not expect one needs factors on host */ 115aaa8cc7dSPierre Jolivet PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 1165f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 117e36ced11SJunchao Zhang PetscCallCXX(aijkok->a_dual.sync_host(exec)); 118e36ced11SJunchao Zhang PetscCallCXX(exec.fence()); 1193ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 120f0cf5187SStefano Zampini } 121f0cf5187SStefano Zampini 122d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 123d71ae5a4SJacob Faibussowitsch { 124076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 125f0cf5187SStefano Zampini 126f0cf5187SStefano Zampini PetscFunctionBegin; 1275519a089SJose E. Roman /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 1285519a089SJose E. Roman Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 1295519a089SJose E. Roman reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 1305519a089SJose E. Roman must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 1315519a089SJose E. Roman */ 1325519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 133*4df4a32cSJunchao Zhang auto exec = PetscGetKokkosExecutionSpace(); 134e36ced11SJunchao Zhang PetscCallCXX(aijkok->a_dual.sync_host(exec)); 135e36ced11SJunchao Zhang PetscCallCXX(exec.fence()); 136076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 137076ba34aSJunchao Zhang } else { /* Happens when calling MatSetValues on a newly created matrix */ 138076ba34aSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 139076ba34aSJunchao Zhang } 1403ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 141076ba34aSJunchao Zhang } 142076ba34aSJunchao Zhang 143d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 144d71ae5a4SJacob Faibussowitsch { 145076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 146076ba34aSJunchao Zhang 147076ba34aSJunchao Zhang PetscFunctionBegin; 1485519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 1493ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 150076ba34aSJunchao Zhang } 151076ba34aSJunchao Zhang 152d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 153d71ae5a4SJacob Faibussowitsch { 154076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 155076ba34aSJunchao Zhang 156076ba34aSJunchao Zhang PetscFunctionBegin; 1575519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 158*4df4a32cSJunchao Zhang auto exec = PetscGetKokkosExecutionSpace(); 159e36ced11SJunchao Zhang PetscCallCXX(aijkok->a_dual.sync_host(exec)); 160e36ced11SJunchao Zhang PetscCallCXX(exec.fence()); 161076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1622328674fSJunchao Zhang } else { 1632328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 1642328674fSJunchao Zhang } 1653ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 166076ba34aSJunchao Zhang } 167076ba34aSJunchao Zhang 168d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 169d71ae5a4SJacob Faibussowitsch { 170076ba34aSJunchao Zhang PetscFunctionBegin; 171076ba34aSJunchao Zhang *array = NULL; 1723ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 173076ba34aSJunchao Zhang } 174076ba34aSJunchao Zhang 175d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 176d71ae5a4SJacob Faibussowitsch { 177076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 178076ba34aSJunchao Zhang 179076ba34aSJunchao Zhang PetscFunctionBegin; 1805519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 181076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1822328674fSJunchao Zhang } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 1832328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 1842328674fSJunchao Zhang } 1853ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 186076ba34aSJunchao Zhang } 187076ba34aSJunchao Zhang 188d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 189d71ae5a4SJacob Faibussowitsch { 190076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 191076ba34aSJunchao Zhang 192076ba34aSJunchao Zhang PetscFunctionBegin; 1935519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 194076ba34aSJunchao Zhang aijkok->a_dual.clear_sync_state(); 195076ba34aSJunchao Zhang aijkok->a_dual.modify_host(); 1962328674fSJunchao Zhang } 1973ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 198f0cf5187SStefano Zampini } 199f0cf5187SStefano Zampini 200d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 201d71ae5a4SJacob Faibussowitsch { 2027ee59b9bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 2037ee59b9bSJunchao Zhang 2047ee59b9bSJunchao Zhang PetscFunctionBegin; 2057ee59b9bSJunchao Zhang PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 2067ee59b9bSJunchao Zhang 2077ee59b9bSJunchao Zhang if (i) *i = aijkok->i_device_data(); 2087ee59b9bSJunchao Zhang if (j) *j = aijkok->j_device_data(); 2097ee59b9bSJunchao Zhang if (a) { 2107ee59b9bSJunchao Zhang aijkok->a_dual.sync_device(); 2117ee59b9bSJunchao Zhang *a = aijkok->a_device_data(); 2127ee59b9bSJunchao Zhang } 2137ee59b9bSJunchao Zhang if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 2143ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2157ee59b9bSJunchao Zhang } 2167ee59b9bSJunchao Zhang 2170e3ece09SJunchao Zhang /* 2180e3ece09SJunchao Zhang Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 2190e3ece09SJunchao Zhang 2200e3ece09SJunchao Zhang Input Parameter: 2210e3ece09SJunchao Zhang . A - the MATSEQAIJKOKKOS matrix 2220e3ece09SJunchao Zhang 2230e3ece09SJunchao Zhang Output Parameters: 2240e3ece09SJunchao Zhang + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 225aaa8cc7dSPierre Jolivet - T_d - the transpose on device, whose value array is allocated but not initialized 2260e3ece09SJunchao Zhang */ 2270e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 228d71ae5a4SJacob Faibussowitsch { 2290e3ece09SJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 2300e3ece09SJunchao Zhang PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 2310e3ece09SJunchao Zhang const PetscInt *Ai = aseq->i, *Aj = aseq->j; 2327b8d4ba6SJunchao Zhang MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1); 2330e3ece09SJunchao Zhang MatRowMapType *Ti = Ti_h.data(); 2347b8d4ba6SJunchao Zhang MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz); 2357b8d4ba6SJunchao Zhang MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz); 2360e3ece09SJunchao Zhang PetscInt *Tj = Tj_h.data(); 2370e3ece09SJunchao Zhang PetscInt *perm = perm_h.data(); 2380e3ece09SJunchao Zhang PetscInt *offset; 239152b3e56SJunchao Zhang 240152b3e56SJunchao Zhang PetscFunctionBegin; 2410e3ece09SJunchao Zhang // Populate Ti 2420e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 2430e3ece09SJunchao Zhang Ti++; 2440e3ece09SJunchao Zhang for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 2450e3ece09SJunchao Zhang Ti--; 2460e3ece09SJunchao Zhang for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 2470e3ece09SJunchao Zhang 2480e3ece09SJunchao Zhang // Populate Tj and the permutation array 2490e3ece09SJunchao Zhang PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 2500e3ece09SJunchao Zhang for (PetscInt i = 0; i < m; i++) { 2510e3ece09SJunchao Zhang for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 2520e3ece09SJunchao Zhang PetscInt r = Aj[j]; // row r of T 2530e3ece09SJunchao Zhang PetscInt disp = Ti[r] + offset[r]; 2540e3ece09SJunchao Zhang 2550e3ece09SJunchao Zhang Tj[disp] = i; // col i of T 2560e3ece09SJunchao Zhang perm[disp] = j; 2570e3ece09SJunchao Zhang offset[r]++; 258076ba34aSJunchao Zhang } 2590e3ece09SJunchao Zhang } 2600e3ece09SJunchao Zhang PetscCall(PetscFree(offset)); 2610e3ece09SJunchao Zhang 2620e3ece09SJunchao Zhang // Sort each row of T, along with the permutation array 2630e3ece09SJunchao Zhang for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 2640e3ece09SJunchao Zhang 2650e3ece09SJunchao Zhang // Output perm and T on device 2660e3ece09SJunchao Zhang auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 2670e3ece09SJunchao Zhang auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 2680e3ece09SJunchao Zhang PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 2690e3ece09SJunchao Zhang PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 2703ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 271152b3e56SJunchao Zhang } 272152b3e56SJunchao Zhang 2730e3ece09SJunchao Zhang // Generate the transpose on device and cache it internally 2740e3ece09SJunchao Zhang // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 2750e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 276d71ae5a4SJacob Faibussowitsch { 2770e3ece09SJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 2780e3ece09SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 2790e3ece09SJunchao Zhang PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 2800e3ece09SJunchao Zhang KokkosCsrMatrix &T = akok->csrmatT; 281152b3e56SJunchao Zhang 282152b3e56SJunchao Zhang PetscFunctionBegin; 2830e3ece09SJunchao Zhang PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 284145b44c9SPierre Jolivet PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 2850e3ece09SJunchao Zhang 2860e3ece09SJunchao Zhang const auto &Aa = akok->a_dual.view_device(); 2870e3ece09SJunchao Zhang 2880e3ece09SJunchao Zhang if (A->symmetric == PETSC_BOOL3_TRUE) { 2890e3ece09SJunchao Zhang *csrmatT = akok->csrmat; 2900e3ece09SJunchao Zhang } else { 2910e3ece09SJunchao Zhang // See if we already have a cached transpose and its value is up to date 2920e3ece09SJunchao Zhang if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 2930e3ece09SJunchao Zhang if (!akok->transpose_updated) { // if the value is out of date, update the cached version 2940e3ece09SJunchao Zhang const auto &perm = akok->transpose_perm; // get the permutation array 2950e3ece09SJunchao Zhang auto &Ta = T.values; 2960e3ece09SJunchao Zhang 297d326c3f1SJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 298076ba34aSJunchao Zhang } 2990e3ece09SJunchao Zhang } else { // Generate T of size n x m for the first time 3000e3ece09SJunchao Zhang MatRowMapKokkosView perm; 3010e3ece09SJunchao Zhang 3020e3ece09SJunchao Zhang PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 3030e3ece09SJunchao Zhang akok->transpose_perm = perm; // cache the perm in this matrix for reuse 304d326c3f1SJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 3050e3ece09SJunchao Zhang } 3060e3ece09SJunchao Zhang akok->transpose_updated = PETSC_TRUE; 3070e3ece09SJunchao Zhang *csrmatT = akok->csrmatT; 3080e3ece09SJunchao Zhang } 3090e3ece09SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 3100e3ece09SJunchao Zhang } 3110e3ece09SJunchao Zhang 3120e3ece09SJunchao Zhang // Generate the Hermitian on device and cache it internally 3130e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 3140e3ece09SJunchao Zhang { 3150e3ece09SJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 3160e3ece09SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 3170e3ece09SJunchao Zhang PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 3180e3ece09SJunchao Zhang KokkosCsrMatrix &T = akok->csrmatH; 3190e3ece09SJunchao Zhang 3200e3ece09SJunchao Zhang PetscFunctionBegin; 3210e3ece09SJunchao Zhang PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 3220e3ece09SJunchao Zhang PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 3230e3ece09SJunchao Zhang 3240e3ece09SJunchao Zhang const auto &Aa = akok->a_dual.view_device(); 3250e3ece09SJunchao Zhang 3260e3ece09SJunchao Zhang if (A->hermitian == PETSC_BOOL3_TRUE) { 3270e3ece09SJunchao Zhang *csrmatH = akok->csrmat; 3280e3ece09SJunchao Zhang } else { 3290e3ece09SJunchao Zhang // See if we already have a cached hermitian and its value is up to date 3300e3ece09SJunchao Zhang if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 3310e3ece09SJunchao Zhang if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 3320e3ece09SJunchao Zhang const auto &perm = akok->transpose_perm; // get the permutation array 3330e3ece09SJunchao Zhang auto &Ta = T.values; 3340e3ece09SJunchao Zhang 335d326c3f1SJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 3360e3ece09SJunchao Zhang } 3370e3ece09SJunchao Zhang } else { // Generate T of size n x m for the first time 3380e3ece09SJunchao Zhang MatRowMapKokkosView perm; 3390e3ece09SJunchao Zhang 3400e3ece09SJunchao Zhang PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 3410e3ece09SJunchao Zhang akok->transpose_perm = perm; // cache the perm in this matrix for reuse 342d326c3f1SJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 3430e3ece09SJunchao Zhang } 3440e3ece09SJunchao Zhang akok->hermitian_updated = PETSC_TRUE; 3450e3ece09SJunchao Zhang *csrmatH = akok->csrmatH; 3460e3ece09SJunchao Zhang } 3473ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 348152b3e56SJunchao Zhang } 349a587d139SMark 3508c3ff71bSJunchao Zhang /* y = A x */ 351d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 352d71ae5a4SJacob Faibussowitsch { 3538c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 354152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 355152b3e56SJunchao Zhang PetscScalarKokkosView yv; 3568c3ff71bSJunchao Zhang 3578c3ff71bSJunchao Zhang PetscFunctionBegin; 3589566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3599566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3609566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 3619566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 3628c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 363d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 3649566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 3659566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 366076ba34aSJunchao Zhang /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 3679566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 3689566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3693ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3708c3ff71bSJunchao Zhang } 3718c3ff71bSJunchao Zhang 3728c3ff71bSJunchao Zhang /* y = A^T x */ 373d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 374d71ae5a4SJacob Faibussowitsch { 3758c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 376152b3e56SJunchao Zhang const char *mode; 377152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 378152b3e56SJunchao Zhang PetscScalarKokkosView yv; 3790e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 3808c3ff71bSJunchao Zhang 3818c3ff71bSJunchao Zhang PetscFunctionBegin; 3829566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3839566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3849566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 3859566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 386152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 3879566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 388152b3e56SJunchao Zhang mode = "N"; 389152b3e56SJunchao Zhang } else { 390076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 3910e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 392152b3e56SJunchao Zhang mode = "T"; 393152b3e56SJunchao Zhang } 394d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 3959566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 3969566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 3970e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 3989566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3993ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4008c3ff71bSJunchao Zhang } 4018c3ff71bSJunchao Zhang 4028c3ff71bSJunchao Zhang /* y = A^H x */ 403d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 404d71ae5a4SJacob Faibussowitsch { 4058c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 406152b3e56SJunchao Zhang const char *mode; 407152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 408152b3e56SJunchao Zhang PetscScalarKokkosView yv; 4090e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 4108c3ff71bSJunchao Zhang 4118c3ff71bSJunchao Zhang PetscFunctionBegin; 4129566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4139566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 4149566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 4159566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 416152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 4179566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 418152b3e56SJunchao Zhang mode = "N"; 419152b3e56SJunchao Zhang } else { 420076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 4210e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 422152b3e56SJunchao Zhang mode = "C"; 423152b3e56SJunchao Zhang } 424d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 4259566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 4269566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 4270e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 4289566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4293ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4308c3ff71bSJunchao Zhang } 4318c3ff71bSJunchao Zhang 4328c3ff71bSJunchao Zhang /* z = A x + y */ 433d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 434d71ae5a4SJacob Faibussowitsch { 4358c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 43692896123SJunchao Zhang ConstPetscScalarKokkosView xv; 437152b3e56SJunchao Zhang PetscScalarKokkosView zv; 4388c3ff71bSJunchao Zhang 4398c3ff71bSJunchao Zhang PetscFunctionBegin; 4409566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4419566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 44292896123SJunchao Zhang if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host 4439566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 44492896123SJunchao Zhang PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device 4458c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 446d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 4479566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 44892896123SJunchao Zhang PetscCall(VecRestoreKokkosView(zz, &zv)); 4499566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 4509566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4513ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4528c3ff71bSJunchao Zhang } 4538c3ff71bSJunchao Zhang 4548c3ff71bSJunchao Zhang /* z = A^T x + y */ 455d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 456d71ae5a4SJacob Faibussowitsch { 4578c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 458152b3e56SJunchao Zhang const char *mode; 45992896123SJunchao Zhang ConstPetscScalarKokkosView xv; 460152b3e56SJunchao Zhang PetscScalarKokkosView zv; 4610e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 4628c3ff71bSJunchao Zhang 4638c3ff71bSJunchao Zhang PetscFunctionBegin; 4649566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4659566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 46692896123SJunchao Zhang if (zz != yy) PetscCall(VecCopy(yy, zz)); 4679566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 46892896123SJunchao Zhang PetscCall(VecGetKokkosView(zz, &zv)); 469152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 4709566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 471152b3e56SJunchao Zhang mode = "N"; 472152b3e56SJunchao Zhang } else { 473076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 4740e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 475152b3e56SJunchao Zhang mode = "T"; 476152b3e56SJunchao Zhang } 477d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 4789566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 47992896123SJunchao Zhang PetscCall(VecRestoreKokkosView(zz, &zv)); 4800e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 4819566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4823ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4838c3ff71bSJunchao Zhang } 4848c3ff71bSJunchao Zhang 4858c3ff71bSJunchao Zhang /* z = A^H x + y */ 486d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 487d71ae5a4SJacob Faibussowitsch { 4888c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 489152b3e56SJunchao Zhang const char *mode; 49092896123SJunchao Zhang ConstPetscScalarKokkosView xv; 491152b3e56SJunchao Zhang PetscScalarKokkosView zv; 4920e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 4938c3ff71bSJunchao Zhang 4948c3ff71bSJunchao Zhang PetscFunctionBegin; 4959566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4969566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 49792896123SJunchao Zhang if (zz != yy) PetscCall(VecCopy(yy, zz)); 4989566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 49992896123SJunchao Zhang PetscCall(VecGetKokkosView(zz, &zv)); 500152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 5019566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 502152b3e56SJunchao Zhang mode = "N"; 503152b3e56SJunchao Zhang } else { 504076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 5050e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 506152b3e56SJunchao Zhang mode = "C"; 507152b3e56SJunchao Zhang } 508d326c3f1SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 5099566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 51092896123SJunchao Zhang PetscCall(VecRestoreKokkosView(zz, &zv)); 5110e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 5129566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 5133ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 514152b3e56SJunchao Zhang } 515152b3e56SJunchao Zhang 51666976f2fSJacob Faibussowitsch static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 517d71ae5a4SJacob Faibussowitsch { 518152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 519152b3e56SJunchao Zhang 520152b3e56SJunchao Zhang PetscFunctionBegin; 521152b3e56SJunchao Zhang switch (op) { 522152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 523152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 5249566063dSJacob Faibussowitsch if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 525152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 526152b3e56SJunchao Zhang break; 527d71ae5a4SJacob Faibussowitsch default: 528d71ae5a4SJacob Faibussowitsch PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 529d71ae5a4SJacob Faibussowitsch break; 530152b3e56SJunchao Zhang } 5313ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5328c3ff71bSJunchao Zhang } 5338c3ff71bSJunchao Zhang 534076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */ 535d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 536d71ae5a4SJacob Faibussowitsch { 537076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 5388c3ff71bSJunchao Zhang 5398c3ff71bSJunchao Zhang PetscFunctionBegin; 5409566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 541076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 5429566063dSJacob Faibussowitsch PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 5438c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 5449566063dSJacob Faibussowitsch PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 545076ba34aSJunchao Zhang } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 5465f80ce2aSJacob Faibussowitsch PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 5479566063dSJacob Faibussowitsch PetscCall(PetscFree(A->defaultvectype)); 5489566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 5499566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 5509566063dSJacob Faibussowitsch PetscCall(MatSetOps_SeqAIJKokkos(A)); 551076ba34aSJunchao Zhang aseq = static_cast<Mat_SeqAIJ *>(A->data); 552394ed5ebSJunchao Zhang if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 5535f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 554f4747e26SJunchao Zhang A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE); 5558c3ff71bSJunchao Zhang } 556076ba34aSJunchao Zhang } 5573ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5588c3ff71bSJunchao Zhang } 5598c3ff71bSJunchao Zhang 560076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 561076ba34aSJunchao Zhang an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 562076ba34aSJunchao Zhang */ 563d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 564d71ae5a4SJacob Faibussowitsch { 565076ba34aSJunchao Zhang Mat_SeqAIJ *bseq; 566076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 567076ba34aSJunchao Zhang Mat mat; 5688c3ff71bSJunchao Zhang 5698c3ff71bSJunchao Zhang PetscFunctionBegin; 570076ba34aSJunchao Zhang /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 5719566063dSJacob Faibussowitsch PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 572076ba34aSJunchao Zhang mat = *B; 573f4747e26SJunchao Zhang if (A->assembled) { 574076ba34aSJunchao Zhang bseq = static_cast<Mat_SeqAIJ *>(mat->data); 575f4747e26SJunchao Zhang bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE); 576076ba34aSJunchao Zhang bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 577076ba34aSJunchao Zhang /* Now copy values to B if needed */ 578076ba34aSJunchao Zhang if (dupOption == MAT_COPY_VALUES) { 579076ba34aSJunchao Zhang if (akok->a_dual.need_sync_device()) { 580076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 581076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 582076ba34aSJunchao Zhang } else { /* If device has the latest data, we only copy data on device */ 583076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 584076ba34aSJunchao Zhang bkok->a_dual.modify_device(); 585076ba34aSJunchao Zhang } 586076ba34aSJunchao Zhang } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 587076ba34aSJunchao Zhang /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 588076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 589076ba34aSJunchao Zhang } 590076ba34aSJunchao Zhang mat->spptr = bkok; 591076ba34aSJunchao Zhang } 592076ba34aSJunchao Zhang 5939566063dSJacob Faibussowitsch PetscCall(PetscFree(mat->defaultvectype)); 5949566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 5959566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 5969566063dSJacob Faibussowitsch PetscCall(MatSetOps_SeqAIJKokkos(mat)); 5973ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5988c3ff71bSJunchao Zhang } 5998c3ff71bSJunchao Zhang 600d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 601d71ae5a4SJacob Faibussowitsch { 6020ecb592aSJunchao Zhang Mat At; 6030e3ece09SJunchao Zhang KokkosCsrMatrix internT; 6040ecb592aSJunchao Zhang Mat_SeqAIJKokkos *atkok, *bkok; 6050ecb592aSJunchao Zhang 6060ecb592aSJunchao Zhang PetscFunctionBegin; 6077fb60732SBarry Smith if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 6089566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 6090ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 610ff751488SJunchao Zhang /* Deep copy internT, as we want to isolate the internal transpose */ 6110e3ece09SJunchao Zhang PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 6129566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 6130ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) *B = At; 6149566063dSJacob Faibussowitsch else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 6150ecb592aSJunchao Zhang } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 6160ecb592aSJunchao Zhang if ((*B)->assembled) { 6170ecb592aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 6180e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 6199566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 6200ecb592aSJunchao Zhang } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 6210ecb592aSJunchao Zhang Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 6220e3ece09SJunchao Zhang MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 6230e3ece09SJunchao Zhang MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 6240e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 6250e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 6260ecb592aSJunchao Zhang } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 6270ecb592aSJunchao Zhang } 6283ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6290ecb592aSJunchao Zhang } 6300ecb592aSJunchao Zhang 631d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 632d71ae5a4SJacob Faibussowitsch { 63386a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 6348c3ff71bSJunchao Zhang 6358c3ff71bSJunchao Zhang PetscFunctionBegin; 63686a27549SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 63786a27549SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 6388c3ff71bSJunchao Zhang delete aijkok; 63986a27549SJunchao Zhang } else { 64086a27549SJunchao Zhang delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 64186a27549SJunchao Zhang } 642cbc6b225SStefano Zampini A->spptr = NULL; 6439566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 6449566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 6459566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 64657761e9aSJunchao Zhang #if defined(PETSC_HAVE_HYPRE) 64757761e9aSJunchao Zhang PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL)); 64857761e9aSJunchao Zhang #endif 6499566063dSJacob Faibussowitsch PetscCall(MatDestroy_SeqAIJ(A)); 6503ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6518c3ff71bSJunchao Zhang } 6528c3ff71bSJunchao Zhang 6533f3ba80aSJunchao Zhang /*MC 6543f3ba80aSJunchao Zhang MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 6553f3ba80aSJunchao Zhang 65615229ffcSPierre Jolivet A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types 6573f3ba80aSJunchao Zhang 6582ef1f0ffSBarry Smith Options Database Key: 65911a5261eSBarry Smith . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 6603f3ba80aSJunchao Zhang 6613f3ba80aSJunchao Zhang Level: beginner 6623f3ba80aSJunchao Zhang 6631cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 6643f3ba80aSJunchao Zhang M*/ 665d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 666d71ae5a4SJacob Faibussowitsch { 66786a27549SJunchao Zhang PetscFunctionBegin; 6689566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 6699566063dSJacob Faibussowitsch PetscCall(MatCreate_SeqAIJ(A)); 6709566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 6713ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 67286a27549SJunchao Zhang } 67386a27549SJunchao Zhang 674076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 675d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 676d71ae5a4SJacob Faibussowitsch { 677076ba34aSJunchao Zhang Mat_SeqAIJ *a, *b; 678076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok, *bkok, *ckok; 679076ba34aSJunchao Zhang MatScalarKokkosView aa, ba, ca; 680076ba34aSJunchao Zhang MatRowMapKokkosView ai, bi, ci; 681076ba34aSJunchao Zhang MatColIdxKokkosView aj, bj, cj; 682076ba34aSJunchao Zhang PetscInt m, n, nnz, aN; 683a3f881fbSStefano Zampini 684a3f881fbSStefano Zampini PetscFunctionBegin; 685076ba34aSJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 686076ba34aSJunchao Zhang PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 6874f572ea9SToby Isaac PetscAssertPointer(C, 4); 688076ba34aSJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 689076ba34aSJunchao Zhang PetscCheckTypeName(B, MATSEQAIJKOKKOS); 6905f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 6915f80ce2aSJacob Faibussowitsch PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 692076ba34aSJunchao Zhang 6939566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 6949566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 695076ba34aSJunchao Zhang a = static_cast<Mat_SeqAIJ *>(A->data); 696076ba34aSJunchao Zhang b = static_cast<Mat_SeqAIJ *>(B->data); 697076ba34aSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 698076ba34aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 699076ba34aSJunchao Zhang aa = akok->a_dual.view_device(); 700076ba34aSJunchao Zhang ai = akok->i_dual.view_device(); 701076ba34aSJunchao Zhang ba = bkok->a_dual.view_device(); 702076ba34aSJunchao Zhang bi = bkok->i_dual.view_device(); 703076ba34aSJunchao Zhang m = A->rmap->n; /* M, N and nnz of C */ 704076ba34aSJunchao Zhang n = A->cmap->n + B->cmap->n; 705076ba34aSJunchao Zhang nnz = a->nz + b->nz; 706076ba34aSJunchao Zhang aN = A->cmap->n; /* N of A */ 707076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { 708076ba34aSJunchao Zhang aj = akok->j_dual.view_device(); 709076ba34aSJunchao Zhang bj = bkok->j_dual.view_device(); 710076ba34aSJunchao Zhang auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 711076ba34aSJunchao Zhang auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 712076ba34aSJunchao Zhang auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 713076ba34aSJunchao Zhang ca = ca_dual.view_device(); 714076ba34aSJunchao Zhang ci = ci_dual.view_device(); 715076ba34aSJunchao Zhang cj = cj_dual.view_device(); 716076ba34aSJunchao Zhang 717076ba34aSJunchao Zhang /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 7189371c9d4SSatish Balay Kokkos::parallel_for( 719d326c3f1SJunchao Zhang Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 720076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 721076ba34aSJunchao Zhang PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 722076ba34aSJunchao Zhang 723076ba34aSJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 724076ba34aSJunchao Zhang ci(i) = coffset; 725076ba34aSJunchao Zhang if (i == m - 1) ci(m) = ai(m) + bi(m); 726076ba34aSJunchao Zhang }); 727076ba34aSJunchao Zhang 728076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 729076ba34aSJunchao Zhang if (k < alen) { 730076ba34aSJunchao Zhang ca(coffset + k) = aa(ai(i) + k); 731076ba34aSJunchao Zhang cj(coffset + k) = aj(ai(i) + k); 732076ba34aSJunchao Zhang } else { 733076ba34aSJunchao Zhang ca(coffset + k) = ba(bi(i) + k - alen); 734076ba34aSJunchao Zhang cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 735076ba34aSJunchao Zhang } 736076ba34aSJunchao Zhang }); 737076ba34aSJunchao Zhang }); 738076ba34aSJunchao Zhang ca_dual.modify_device(); 739076ba34aSJunchao Zhang ci_dual.modify_device(); 740076ba34aSJunchao Zhang cj_dual.modify_device(); 7419566063dSJacob Faibussowitsch PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 7429566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 743076ba34aSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { 744076ba34aSJunchao Zhang PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 745076ba34aSJunchao Zhang PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 746076ba34aSJunchao Zhang ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 747076ba34aSJunchao Zhang ca = ckok->a_dual.view_device(); 748076ba34aSJunchao Zhang ci = ckok->i_dual.view_device(); 749076ba34aSJunchao Zhang 7509371c9d4SSatish Balay Kokkos::parallel_for( 751d326c3f1SJunchao Zhang Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 752076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 753076ba34aSJunchao Zhang PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 754076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 755076ba34aSJunchao Zhang if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 756076ba34aSJunchao Zhang else ca(ci(i) + k) = ba(bi(i) + k - alen); 757076ba34aSJunchao Zhang }); 758076ba34aSJunchao Zhang }); 7599566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 760076ba34aSJunchao Zhang } 7613ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 762076ba34aSJunchao Zhang } 763076ba34aSJunchao Zhang 764d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 765d71ae5a4SJacob Faibussowitsch { 766076ba34aSJunchao Zhang PetscFunctionBegin; 767076ba34aSJunchao Zhang delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 7683ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 769a3f881fbSStefano Zampini } 770a3f881fbSStefano Zampini 771d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 772d71ae5a4SJacob Faibussowitsch { 773a3f881fbSStefano Zampini Mat_Product *product = C->product; 774a3f881fbSStefano Zampini Mat A, B; 775076ba34aSJunchao Zhang bool transA, transB; /* use bool, since KK needs this type */ 776a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok, *bkok, *ckok; 777a3f881fbSStefano Zampini Mat_SeqAIJ *c; 778076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 7790e3ece09SJunchao Zhang KokkosCsrMatrix csrmatA, csrmatB; 780a3f881fbSStefano Zampini 781a3f881fbSStefano Zampini PetscFunctionBegin; 782a3f881fbSStefano Zampini MatCheckProduct(C, 1); 7835f80ce2aSJacob Faibussowitsch PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 784076ba34aSJunchao Zhang pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 785076ba34aSJunchao Zhang 7860e3ece09SJunchao Zhang // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 7870e3ece09SJunchao Zhang // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 7880e3ece09SJunchao Zhang // we still do numeric. 7890e3ece09SJunchao Zhang if (pdata->reusesym) { // numeric reuses results from symbolic 7900e3ece09SJunchao Zhang pdata->reusesym = PETSC_FALSE; 7913ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 792076ba34aSJunchao Zhang } 793076ba34aSJunchao Zhang 794076ba34aSJunchao Zhang switch (product->type) { 7959371c9d4SSatish Balay case MATPRODUCT_AB: 7969371c9d4SSatish Balay transA = false; 7979371c9d4SSatish Balay transB = false; 7989371c9d4SSatish Balay break; 7999371c9d4SSatish Balay case MATPRODUCT_AtB: 8009371c9d4SSatish Balay transA = true; 8019371c9d4SSatish Balay transB = false; 8029371c9d4SSatish Balay break; 8039371c9d4SSatish Balay case MATPRODUCT_ABt: 8049371c9d4SSatish Balay transA = false; 8059371c9d4SSatish Balay transB = true; 8069371c9d4SSatish Balay break; 807d71ae5a4SJacob Faibussowitsch default: 808d71ae5a4SJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 809076ba34aSJunchao Zhang } 810076ba34aSJunchao Zhang 811a3f881fbSStefano Zampini A = product->A; 812a3f881fbSStefano Zampini B = product->B; 8139566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 8149566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 815a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 816a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 817a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 818076ba34aSJunchao Zhang 8195f80ce2aSJacob Faibussowitsch PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 820076ba34aSJunchao Zhang 8210e3ece09SJunchao Zhang csrmatA = akok->csrmat; 8220e3ece09SJunchao Zhang csrmatB = bkok->csrmat; 823076ba34aSJunchao Zhang 824076ba34aSJunchao Zhang /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 825076ba34aSJunchao Zhang if (transA) { 8269566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 827076ba34aSJunchao Zhang transA = false; 828a3f881fbSStefano Zampini } 829a3f881fbSStefano Zampini 830076ba34aSJunchao Zhang if (transB) { 8319566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 832076ba34aSJunchao Zhang transB = false; 833076ba34aSJunchao Zhang } 8349566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 8350e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 8360e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 837866eb059SJunchao Zhang auto spgemmHandle = pdata->kh.get_spgemm_handle(); 838866eb059SJunchao Zhang if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 839e944a159SJunchao Zhang #endif 840866eb059SJunchao Zhang 8419566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 8429566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(C)); 843a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 844a3f881fbSStefano Zampini c = (Mat_SeqAIJ *)C->data; 8459566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 8469566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 8479566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 848a3f881fbSStefano Zampini c->reallocs = 0; 849076ba34aSJunchao Zhang C->info.mallocs = 0; 850a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 851a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 852a3f881fbSStefano Zampini C->num_ass++; 8533ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 854a3f881fbSStefano Zampini } 855a3f881fbSStefano Zampini 856d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 857d71ae5a4SJacob Faibussowitsch { 858076ba34aSJunchao Zhang Mat_Product *product = C->product; 859076ba34aSJunchao Zhang MatProductType ptype; 860076ba34aSJunchao Zhang Mat A, B; 861076ba34aSJunchao Zhang bool transA, transB; 862076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok, *bkok, *ckok; 863076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 864076ba34aSJunchao Zhang MPI_Comm comm; 8650e3ece09SJunchao Zhang KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 866a3f881fbSStefano Zampini 867a3f881fbSStefano Zampini PetscFunctionBegin; 868a3f881fbSStefano Zampini MatCheckProduct(C, 1); 8699566063dSJacob Faibussowitsch PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 8705f80ce2aSJacob Faibussowitsch PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 871a3f881fbSStefano Zampini A = product->A; 872a3f881fbSStefano Zampini B = product->B; 8739566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 8749566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 875a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 876a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 8770e3ece09SJunchao Zhang csrmatA = akok->csrmat; 8780e3ece09SJunchao Zhang csrmatB = bkok->csrmat; 879076ba34aSJunchao Zhang 880a3f881fbSStefano Zampini ptype = product->type; 8810e3ece09SJunchao Zhang // Take advantage of the symmetry if true 8820e3ece09SJunchao Zhang if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 8830e3ece09SJunchao Zhang ptype = MATPRODUCT_AB; 8840e3ece09SJunchao Zhang product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 8850e3ece09SJunchao Zhang } 8860e3ece09SJunchao Zhang if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 8870e3ece09SJunchao Zhang ptype = MATPRODUCT_AB; 8880e3ece09SJunchao Zhang product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 8890e3ece09SJunchao Zhang } 8900e3ece09SJunchao Zhang 891a3f881fbSStefano Zampini switch (ptype) { 8929371c9d4SSatish Balay case MATPRODUCT_AB: 8939371c9d4SSatish Balay transA = false; 8949371c9d4SSatish Balay transB = false; 8950e6a1e94SMark Adams PetscCall(MatSetBlockSizesFromMats(C, A, B)); 8969371c9d4SSatish Balay break; 8979371c9d4SSatish Balay case MATPRODUCT_AtB: 8989371c9d4SSatish Balay transA = true; 8999371c9d4SSatish Balay transB = false; 9000e6a1e94SMark Adams if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs)); 9010e6a1e94SMark Adams if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs)); 9029371c9d4SSatish Balay break; 9039371c9d4SSatish Balay case MATPRODUCT_ABt: 9049371c9d4SSatish Balay transA = false; 9059371c9d4SSatish Balay transB = true; 9060e6a1e94SMark Adams if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs)); 9070e6a1e94SMark Adams if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs)); 9089371c9d4SSatish Balay break; 909d71ae5a4SJacob Faibussowitsch default: 910d71ae5a4SJacob Faibussowitsch SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 911a3f881fbSStefano Zampini } 9120e3ece09SJunchao Zhang PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 913076ba34aSJunchao Zhang pdata->reusesym = product->api_user; 914a3f881fbSStefano Zampini 915076ba34aSJunchao Zhang /* TODO: add command line options to select spgemm algorithms */ 916866eb059SJunchao Zhang auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 917866eb059SJunchao Zhang 918866eb059SJunchao Zhang /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 919866eb059SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 920866eb059SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 921866eb059SJunchao Zhang spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 922866eb059SJunchao Zhang #endif 923866eb059SJunchao Zhang #endif 9240e3ece09SJunchao Zhang PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 925076ba34aSJunchao Zhang 9269566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 927076ba34aSJunchao Zhang /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 928076ba34aSJunchao Zhang if (transA) { 9299566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 930076ba34aSJunchao Zhang transA = false; 931076ba34aSJunchao Zhang } 932076ba34aSJunchao Zhang 933076ba34aSJunchao Zhang if (transB) { 9349566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 935076ba34aSJunchao Zhang transB = false; 936076ba34aSJunchao Zhang } 937076ba34aSJunchao Zhang 9380e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 939076ba34aSJunchao Zhang /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 940076ba34aSJunchao Zhang So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 941076ba34aSJunchao Zhang calling new Mat_SeqAIJKokkos(). 942076ba34aSJunchao Zhang TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 943076ba34aSJunchao Zhang */ 9440e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 9450e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 946866eb059SJunchao Zhang /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 947866eb059SJunchao Zhang auto spgemmHandle = pdata->kh.get_spgemm_handle(); 948866eb059SJunchao Zhang if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 949e944a159SJunchao Zhang #endif 9509566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 951076ba34aSJunchao Zhang 9529566063dSJacob Faibussowitsch PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 9539566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 954076ba34aSJunchao Zhang C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 9553ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 956a3f881fbSStefano Zampini } 957a3f881fbSStefano Zampini 958a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 959d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 960d71ae5a4SJacob Faibussowitsch { 961076ba34aSJunchao Zhang Mat_Product *product = mat->product; 962a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 963a3f881fbSStefano Zampini 964a3f881fbSStefano Zampini PetscFunctionBegin; 965a3f881fbSStefano Zampini MatCheckProduct(mat, 1); 9669566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 96748a46eb9SPierre Jolivet if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 968a3f881fbSStefano Zampini if (Biskok && Ciskok) { 969a3f881fbSStefano Zampini switch (product->type) { 970a3f881fbSStefano Zampini case MATPRODUCT_AB: 971a3f881fbSStefano Zampini case MATPRODUCT_AtB: 972d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABt: 973d71ae5a4SJacob Faibussowitsch mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 974d71ae5a4SJacob Faibussowitsch break; 975a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 976a3f881fbSStefano Zampini case MATPRODUCT_RARt: 977d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABC: 978d71ae5a4SJacob Faibussowitsch mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 979d71ae5a4SJacob Faibussowitsch break; 980d71ae5a4SJacob Faibussowitsch default: 981d71ae5a4SJacob Faibussowitsch break; 982a3f881fbSStefano Zampini } 983a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 9849566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 985a3f881fbSStefano Zampini } 9863ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 987a3f881fbSStefano Zampini } 988a587d139SMark 989d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 990d71ae5a4SJacob Faibussowitsch { 991f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 992f0cf5187SStefano Zampini 993f0cf5187SStefano Zampini PetscFunctionBegin; 9949566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 9959566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 996f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 997d326c3f1SJunchao Zhang KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 9989566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 9999566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 10009566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 10013ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1002f0cf5187SStefano Zampini } 1003f0cf5187SStefano Zampini 1004f4747e26SJunchao Zhang // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular) 1005f4747e26SJunchao Zhang static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a) 1006f4747e26SJunchao Zhang { 1007f4747e26SJunchao Zhang Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1008f4747e26SJunchao Zhang 1009f4747e26SJunchao Zhang PetscFunctionBegin; 1010f4747e26SJunchao Zhang if (A->assembled && aijseq->diagonaldense) { // no missing diagonals 1011f4747e26SJunchao Zhang PetscInt n = PetscMin(A->rmap->n, A->cmap->n); 1012f4747e26SJunchao Zhang 1013f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1014f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1015f4747e26SJunchao Zhang const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1016f4747e26SJunchao Zhang const auto &Aa = aijkok->a_dual.view_device(); 1017f4747e26SJunchao Zhang const auto &Adiag = aijkok->diag_dual.view_device(); 1018d326c3f1SJunchao Zhang PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; })); 1019f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1020f4747e26SJunchao Zhang PetscCall(PetscLogGpuFlops(n)); 1021f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1022f4747e26SJunchao Zhang } else { // need reassembly, very slow! 1023f4747e26SJunchao Zhang PetscCall(MatShift_Basic(A, a)); 1024f4747e26SJunchao Zhang } 1025f4747e26SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 1026f4747e26SJunchao Zhang } 1027f4747e26SJunchao Zhang 1028f4747e26SJunchao Zhang static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is) 1029f4747e26SJunchao Zhang { 1030f4747e26SJunchao Zhang Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data); 1031f4747e26SJunchao Zhang 1032f4747e26SJunchao Zhang PetscFunctionBegin; 1033f4747e26SJunchao Zhang if (Y->assembled && aijseq->diagonaldense) { // no missing diagonals 1034f4747e26SJunchao Zhang ConstPetscScalarKokkosView dv; 1035f4747e26SJunchao Zhang PetscInt n, nv; 1036f4747e26SJunchao Zhang 1037f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1038f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 1039f4747e26SJunchao Zhang PetscCall(VecGetKokkosView(D, &dv)); 1040f4747e26SJunchao Zhang PetscCall(VecGetLocalSize(D, &nv)); 1041f4747e26SJunchao Zhang n = PetscMin(Y->rmap->n, Y->cmap->n); 1042f4747e26SJunchao Zhang PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match"); 1043f4747e26SJunchao Zhang 1044f4747e26SJunchao Zhang const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1045f4747e26SJunchao Zhang const auto &Aa = aijkok->a_dual.view_device(); 1046f4747e26SJunchao Zhang const auto &Adiag = aijkok->diag_dual.view_device(); 1047f4747e26SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( 1048d326c3f1SJunchao Zhang Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1049f4747e26SJunchao Zhang if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i); 1050f4747e26SJunchao Zhang else Aa(Adiag(i)) += dv(i); 1051f4747e26SJunchao Zhang })); 1052f4747e26SJunchao Zhang PetscCall(VecRestoreKokkosView(D, &dv)); 1053f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1054f4747e26SJunchao Zhang PetscCall(PetscLogGpuFlops(n)); 1055f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1056f4747e26SJunchao Zhang } else { // need reassembly, very slow! 1057f4747e26SJunchao Zhang PetscCall(MatDiagonalSet_Default(Y, D, is)); 1058f4747e26SJunchao Zhang } 1059f4747e26SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 1060f4747e26SJunchao Zhang } 1061f4747e26SJunchao Zhang 1062f4747e26SJunchao Zhang static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr) 1063f4747e26SJunchao Zhang { 1064f4747e26SJunchao Zhang Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1065f4747e26SJunchao Zhang PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz; 1066f4747e26SJunchao Zhang ConstPetscScalarKokkosView lv, rv; 1067f4747e26SJunchao Zhang 1068f4747e26SJunchao Zhang PetscFunctionBegin; 1069f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 1070f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1071f4747e26SJunchao Zhang const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1072f4747e26SJunchao Zhang const auto &Aa = aijkok->a_dual.view_device(); 1073f4747e26SJunchao Zhang const auto &Ai = aijkok->i_dual.view_device(); 1074f4747e26SJunchao Zhang const auto &Aj = aijkok->j_dual.view_device(); 1075f4747e26SJunchao Zhang if (ll) { 1076f4747e26SJunchao Zhang PetscCall(VecGetLocalSize(ll, &m)); 1077f4747e26SJunchao Zhang PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length"); 1078f4747e26SJunchao Zhang PetscCall(VecGetKokkosView(ll, &lv)); 1079f4747e26SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( // for each row 1080d326c3f1SJunchao Zhang Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1081f4747e26SJunchao Zhang PetscInt i = t.league_rank(); // row i 1082f4747e26SJunchao Zhang PetscInt len = Ai(i + 1) - Ai(i); 1083f4747e26SJunchao Zhang // scale entries on the row 1084f4747e26SJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); }); 1085f4747e26SJunchao Zhang })); 1086f4747e26SJunchao Zhang PetscCall(VecRestoreKokkosView(ll, &lv)); 1087f4747e26SJunchao Zhang PetscCall(PetscLogGpuFlops(nz)); 1088f4747e26SJunchao Zhang } 1089f4747e26SJunchao Zhang if (rr) { 1090f4747e26SJunchao Zhang PetscCall(VecGetLocalSize(rr, &n)); 1091f4747e26SJunchao Zhang PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length"); 1092f4747e26SJunchao Zhang PetscCall(VecGetKokkosView(rr, &rv)); 1093f4747e26SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( // for each nonzero 1094d326c3f1SJunchao Zhang Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); })); 1095f4747e26SJunchao Zhang PetscCall(VecRestoreKokkosView(rr, &lv)); 1096f4747e26SJunchao Zhang PetscCall(PetscLogGpuFlops(nz)); 1097f4747e26SJunchao Zhang } 1098f4747e26SJunchao Zhang PetscCall(MatSeqAIJKokkosModifyDevice(A)); 1099f4747e26SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1100f4747e26SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 1101f4747e26SJunchao Zhang } 1102f4747e26SJunchao Zhang 1103d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1104d71ae5a4SJacob Faibussowitsch { 1105076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1106a587d139SMark 1107a587d139SMark PetscFunctionBegin; 1108076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 11092328674fSJunchao Zhang if (aijkok) { /* Only zero the device if data is already there */ 1110d326c3f1SJunchao Zhang KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0); 11119566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 11122328674fSJunchao Zhang } else { /* Might be preallocated but not assembled */ 11139566063dSJacob Faibussowitsch PetscCall(MatZeroEntries_SeqAIJ(A)); 11142328674fSJunchao Zhang } 11153ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1116a587d139SMark } 1117a587d139SMark 1118d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1119d71ae5a4SJacob Faibussowitsch { 1120f78ce678SMark Adams Mat_SeqAIJKokkos *aijkok; 1121f78ce678SMark Adams PetscInt n; 1122f78ce678SMark Adams PetscScalarKokkosView xv; 1123f78ce678SMark Adams 1124f78ce678SMark Adams PetscFunctionBegin; 1125f78ce678SMark Adams PetscCall(VecGetLocalSize(x, &n)); 1126f78ce678SMark Adams PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1127f78ce678SMark Adams PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1128f78ce678SMark Adams 1129f78ce678SMark Adams PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1130f78ce678SMark Adams aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1131f78ce678SMark Adams 1132f78ce678SMark Adams const auto &Aa = aijkok->a_dual.view_device(); 1133f78ce678SMark Adams const auto &Ai = aijkok->i_dual.view_device(); 1134f78ce678SMark Adams const auto &Adiag = aijkok->diag_dual.view_device(); 1135f78ce678SMark Adams 1136f78ce678SMark Adams PetscCall(VecGetKokkosViewWrite(x, &xv)); 11379371c9d4SSatish Balay Kokkos::parallel_for( 1138d326c3f1SJunchao Zhang Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { 1139f78ce678SMark Adams if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1140f78ce678SMark Adams else xv(i) = 0; 1141f78ce678SMark Adams }); 1142f78ce678SMark Adams PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 11433ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1144f78ce678SMark Adams } 1145f78ce678SMark Adams 1146db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1147d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1148d71ae5a4SJacob Faibussowitsch { 1149db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1150db78de30SJunchao Zhang 1151db78de30SJunchao Zhang PetscFunctionBegin; 1152db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 11534f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1154db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 11559566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1156db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1157076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 11583ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1159db78de30SJunchao Zhang } 1160db78de30SJunchao Zhang 1161d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1162d71ae5a4SJacob Faibussowitsch { 1163db78de30SJunchao Zhang PetscFunctionBegin; 1164db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 11654f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1166db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 11673ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1168db78de30SJunchao Zhang } 1169db78de30SJunchao Zhang 1170d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1171d71ae5a4SJacob Faibussowitsch { 1172db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1173db78de30SJunchao Zhang 1174db78de30SJunchao Zhang PetscFunctionBegin; 1175db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 11764f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1177db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 11789566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1179db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1180076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 11813ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1182db78de30SJunchao Zhang } 1183db78de30SJunchao Zhang 1184d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1185d71ae5a4SJacob Faibussowitsch { 1186db78de30SJunchao Zhang PetscFunctionBegin; 1187db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 11884f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1189db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 11909566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 11913ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1192db78de30SJunchao Zhang } 1193db78de30SJunchao Zhang 1194d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1195d71ae5a4SJacob Faibussowitsch { 1196db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1197db78de30SJunchao Zhang 1198db78de30SJunchao Zhang PetscFunctionBegin; 1199db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 12004f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1201db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1202db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1203076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 12043ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1205db78de30SJunchao Zhang } 1206db78de30SJunchao Zhang 1207d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1208d71ae5a4SJacob Faibussowitsch { 1209db78de30SJunchao Zhang PetscFunctionBegin; 1210db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 12114f572ea9SToby Isaac PetscAssertPointer(kv, 2); 1212db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 12139566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 12143ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1215db78de30SJunchao Zhang } 1216db78de30SJunchao Zhang 1217c17cf699SJunchao Zhang /* Computes Y += alpha X */ 1218d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1219d71ae5a4SJacob Faibussowitsch { 1220a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1221c17cf699SJunchao Zhang Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1222c17cf699SJunchao Zhang ConstMatScalarKokkosView Xa; 1223c17cf699SJunchao Zhang MatScalarKokkosView Ya; 1224*4df4a32cSJunchao Zhang auto exec = PetscGetKokkosExecutionSpace(); 1225a587d139SMark 1226a587d139SMark PetscFunctionBegin; 1227c17cf699SJunchao Zhang PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1228c17cf699SJunchao Zhang PetscCheckTypeName(X, MATSEQAIJKOKKOS); 12299566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 12309566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(X)); 12319566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1232db78de30SJunchao Zhang 1233c17cf699SJunchao Zhang if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1234a587d139SMark PetscBool e; 12359566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1236a587d139SMark if (e) { 12379566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1238c17cf699SJunchao Zhang if (e) pattern = SAME_NONZERO_PATTERN; 1239a587d139SMark } 1240a587d139SMark } 1241db78de30SJunchao Zhang 1242c17cf699SJunchao Zhang /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1243c17cf699SJunchao Zhang cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1244c17cf699SJunchao Zhang If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1245c17cf699SJunchao Zhang KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1246c17cf699SJunchao Zhang */ 1247c17cf699SJunchao Zhang ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1248c17cf699SJunchao Zhang xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1249c17cf699SJunchao Zhang Xa = xkok->a_dual.view_device(); 1250c17cf699SJunchao Zhang Ya = ykok->a_dual.view_device(); 1251c17cf699SJunchao Zhang 1252c17cf699SJunchao Zhang if (pattern == SAME_NONZERO_PATTERN) { 1253d326c3f1SJunchao Zhang KokkosBlas::axpy(exec, alpha, Xa, Ya); 12549566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1255c17cf699SJunchao Zhang } else if (pattern == SUBSET_NONZERO_PATTERN) { 1256c17cf699SJunchao Zhang MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1257c17cf699SJunchao Zhang MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1258c17cf699SJunchao Zhang 12599371c9d4SSatish Balay Kokkos::parallel_for( 1260d326c3f1SJunchao Zhang Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 12610e3ece09SJunchao Zhang PetscInt i = t.league_rank(); // row i 12620e3ece09SJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { 12630e3ece09SJunchao Zhang // Only one thread works in a team 1264c17cf699SJunchao Zhang PetscInt p, q = Yi(i); 12650e3ece09SJunchao Zhang for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 12660e3ece09SJunchao Zhang while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 12670e3ece09SJunchao Zhang if (Xj(p) == Yj(q)) { // Found it 1268c17cf699SJunchao Zhang Ya(q) += alpha * Xa(p); 1269c17cf699SJunchao Zhang q++; 1270a587d139SMark } else { 12710e3ece09SJunchao Zhang // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 12720e3ece09SJunchao Zhang // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 12730e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 12740e3ece09SJunchao Zhang if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 12758b8b16f9SJunchao Zhang #else 12760e3ece09SJunchao Zhang if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 12778b8b16f9SJunchao Zhang #endif 1278a587d139SMark } 1279c17cf699SJunchao Zhang } 1280c17cf699SJunchao Zhang }); 1281c17cf699SJunchao Zhang }); 12829566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 12830e3ece09SJunchao Zhang } else { // different nonzero patterns 1284c17cf699SJunchao Zhang Mat Z; 1285c17cf699SJunchao Zhang KokkosCsrMatrix zcsr; 1286c17cf699SJunchao Zhang KernelHandle kh; 12870e3ece09SJunchao Zhang kh.create_spadd_handle(true); // X, Y are sorted 1288c17cf699SJunchao Zhang KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1289c17cf699SJunchao Zhang KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1290c17cf699SJunchao Zhang zkok = new Mat_SeqAIJKokkos(zcsr); 12919566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 12929566063dSJacob Faibussowitsch PetscCall(MatHeaderReplace(Y, &Z)); 1293c17cf699SJunchao Zhang kh.destroy_spadd_handle(); 1294c17cf699SJunchao Zhang } 12959566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 12960e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 12973ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1298a587d139SMark } 1299a587d139SMark 13002c4ab24aSJunchao Zhang struct MatCOOStruct_SeqAIJKokkos { 13012c4ab24aSJunchao Zhang PetscCount n; 13022c4ab24aSJunchao Zhang PetscCount Atot; 13032c4ab24aSJunchao Zhang PetscInt nz; 13042c4ab24aSJunchao Zhang PetscCountKokkosView jmap; 13052c4ab24aSJunchao Zhang PetscCountKokkosView perm; 13062c4ab24aSJunchao Zhang 13072c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h) 13082c4ab24aSJunchao Zhang { 13092c4ab24aSJunchao Zhang nz = coo_h->nz; 13102c4ab24aSJunchao Zhang n = coo_h->n; 13112c4ab24aSJunchao Zhang Atot = coo_h->Atot; 13122c4ab24aSJunchao Zhang jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1)); 13132c4ab24aSJunchao Zhang perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot)); 13142c4ab24aSJunchao Zhang } 13152c4ab24aSJunchao Zhang }; 13162c4ab24aSJunchao Zhang 131749abdd8aSBarry Smith static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(void **data) 13182c4ab24aSJunchao Zhang { 13192c4ab24aSJunchao Zhang PetscFunctionBegin; 132049abdd8aSBarry Smith PetscCallCXX(delete static_cast<MatCOOStruct_SeqAIJKokkos *>(*data)); 13212c4ab24aSJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 13222c4ab24aSJunchao Zhang } 13232c4ab24aSJunchao Zhang 1324d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1325d71ae5a4SJacob Faibussowitsch { 132642550becSJunchao Zhang Mat_SeqAIJKokkos *akok; 132742550becSJunchao Zhang Mat_SeqAIJ *aseq; 132803e76207SPierre Jolivet PetscContainer container_h; 13292c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJ *coo_h; 13302c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJKokkos *coo_d; 133142550becSJunchao Zhang 133242550becSJunchao Zhang PetscFunctionBegin; 13339566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1334394ed5ebSJunchao Zhang aseq = static_cast<Mat_SeqAIJ *>(mat->data); 133542550becSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1336cbc6b225SStefano Zampini delete akok; 1337f4747e26SJunchao Zhang mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE); 13389566063dSJacob Faibussowitsch PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 13392c4ab24aSJunchao Zhang 13402c4ab24aSJunchao Zhang // Copy the COO struct to device 13412c4ab24aSJunchao Zhang PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h)); 13422c4ab24aSJunchao Zhang PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h)); 13432c4ab24aSJunchao Zhang PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h)); 13442c4ab24aSJunchao Zhang 13452c4ab24aSJunchao Zhang // Put the COO struct in a container and then attach that to the matrix 134603e76207SPierre Jolivet PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos)); 13473ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 134842550becSJunchao Zhang } 134942550becSJunchao Zhang 1350d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1351d71ae5a4SJacob Faibussowitsch { 135242550becSJunchao Zhang MatScalarKokkosView Aa; 135342550becSJunchao Zhang ConstMatScalarKokkosView kv; 135442550becSJunchao Zhang PetscMemType memtype; 13552c4ab24aSJunchao Zhang PetscContainer container; 13562c4ab24aSJunchao Zhang MatCOOStruct_SeqAIJKokkos *coo; 135742550becSJunchao Zhang 135842550becSJunchao Zhang PetscFunctionBegin; 13592c4ab24aSJunchao Zhang PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container)); 13602c4ab24aSJunchao Zhang PetscCall(PetscContainerGetPointer(container, (void **)&coo)); 13612c4ab24aSJunchao Zhang 13622c4ab24aSJunchao Zhang const auto &n = coo->n; 13632c4ab24aSJunchao Zhang const auto &Annz = coo->nz; 13642c4ab24aSJunchao Zhang const auto &jmap = coo->jmap; 13652c4ab24aSJunchao Zhang const auto &perm = coo->perm; 13662c4ab24aSJunchao Zhang 13679566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(v, &memtype)); 136842550becSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 13692c4ab24aSJunchao Zhang kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n)); 137042550becSJunchao Zhang } else { 13712c4ab24aSJunchao Zhang kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */ 137242550becSJunchao Zhang } 137342550becSJunchao Zhang 1374c7b718f4SJunchao Zhang if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1375c7b718f4SJunchao Zhang else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 137642550becSJunchao Zhang 137708bb9926SJunchao Zhang PetscCall(PetscLogGpuTimeBegin()); 13789371c9d4SSatish Balay Kokkos::parallel_for( 1379d326c3f1SJunchao Zhang Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) { 1380c7b718f4SJunchao Zhang PetscScalar sum = 0.0; 1381c7b718f4SJunchao Zhang for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1382c7b718f4SJunchao Zhang Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1383c7b718f4SJunchao Zhang }); 138408bb9926SJunchao Zhang PetscCall(PetscLogGpuTimeEnd()); 1385394ed5ebSJunchao Zhang 13869566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 13879566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 13883ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 138942550becSJunchao Zhang } 139042550becSJunchao Zhang 1391d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1392d71ae5a4SJacob Faibussowitsch { 13938f7e8f9dSMark Adams PetscFunctionBegin; 13949566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncHost(A)); 13959566063dSJacob Faibussowitsch PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 13968f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 13973ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 13988f7e8f9dSMark Adams } 13998f7e8f9dSMark Adams 1400d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1401d71ae5a4SJacob Faibussowitsch { 1402076ba34aSJunchao Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1403076ba34aSJunchao Zhang 14048c3ff71bSJunchao Zhang PetscFunctionBegin; 1405076ba34aSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 14066f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 14076f3d89d0SStefano Zampini 14088c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 14098c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 14108c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1411a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 1412f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 1413a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1414076ba34aSJunchao Zhang A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 14158c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 14168c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 14178c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 14188c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 14198c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 14208c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1421076ba34aSJunchao Zhang A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 14220ecb592aSJunchao Zhang A->ops->transpose = MatTranspose_SeqAIJKokkos; 1423152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 1424f78ce678SMark Adams A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1425f4747e26SJunchao Zhang A->ops->shift = MatShift_SeqAIJKokkos; 1426f4747e26SJunchao Zhang A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos; 1427f4747e26SJunchao Zhang A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos; 1428076ba34aSJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1429076ba34aSJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1430076ba34aSJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1431076ba34aSJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1432076ba34aSJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1433076ba34aSJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 14347ee59b9bSJunchao Zhang a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 143542550becSJunchao Zhang 14369566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 14379566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 143857761e9aSJunchao Zhang #if defined(PETSC_HAVE_HYPRE) 143957761e9aSJunchao Zhang PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE)); 144057761e9aSJunchao Zhang #endif 14413ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1442076ba34aSJunchao Zhang } 1443076ba34aSJunchao Zhang 14449d13fa56SJunchao Zhang /* 14459d13fa56SJunchao Zhang Extract the (prescribled) diagonal blocks of the matrix and then invert them 14469d13fa56SJunchao Zhang 14479d13fa56SJunchao Zhang Input Parameters: 14489d13fa56SJunchao Zhang + A - the MATSEQAIJKOKKOS matrix 14499d13fa56SJunchao Zhang . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i) 14509d13fa56SJunchao Zhang . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[] 14519d13fa56SJunchao Zhang . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i) 14529d13fa56SJunchao Zhang - work - a pre-allocated work buffer (as big as diagVal) for use by this routine 14539d13fa56SJunchao Zhang 14549d13fa56SJunchao Zhang Output Parameter: 14559d13fa56SJunchao Zhang . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order) 14569d13fa56SJunchao Zhang */ 14579d13fa56SJunchao Zhang PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal) 14589d13fa56SJunchao Zhang { 14599d13fa56SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 14609d13fa56SJunchao Zhang PetscInt nblocks = bs.extent(0) - 1; 14619d13fa56SJunchao Zhang 14629d13fa56SJunchao Zhang PetscFunctionBegin; 14639d13fa56SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device 14649d13fa56SJunchao Zhang 14659d13fa56SJunchao Zhang // Pull out the diagonal blocks of the matrix and then invert the blocks 14669d13fa56SJunchao Zhang auto Aa = akok->a_dual.view_device(); 14679d13fa56SJunchao Zhang auto Ai = akok->i_dual.view_device(); 14689d13fa56SJunchao Zhang auto Aj = akok->j_dual.view_device(); 14699d13fa56SJunchao Zhang auto Adiag = akok->diag_dual.view_device(); 14709d13fa56SJunchao Zhang // TODO: how to tune the team size? 147145402d8aSJunchao Zhang #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY) 14729d13fa56SJunchao Zhang auto ts = Kokkos::AUTO(); 14739d13fa56SJunchao Zhang #else 14749d13fa56SJunchao Zhang auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs 14759d13fa56SJunchao Zhang #endif 14769d13fa56SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( 1477d326c3f1SJunchao Zhang Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) { 14789d13fa56SJunchao Zhang const PetscInt bid = teamMember.league_rank(); // block id 14799d13fa56SJunchao Zhang const PetscInt rstart = bs(bid); // this block starts from this row 14809d13fa56SJunchao Zhang const PetscInt m = bs(bid + 1) - bs(bid); // size of this block 14819d13fa56SJunchao Zhang const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order 14829d13fa56SJunchao Zhang const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); 14839d13fa56SJunchao Zhang 14849d13fa56SJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B 14859d13fa56SJunchao Zhang PetscInt i = rstart + r; // i-th row in A 14869d13fa56SJunchao Zhang 14879d13fa56SJunchao Zhang if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case) 14889d13fa56SJunchao Zhang PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row 14899d13fa56SJunchao Zhang 14909d13fa56SJunchao Zhang for (PetscInt c = 0; c < m; c++) { // walk n steps to see what column indices we will meet 14919d13fa56SJunchao Zhang if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero 14929d13fa56SJunchao Zhang B(r, c) = 0.0; 14939d13fa56SJunchao Zhang } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column 14949d13fa56SJunchao Zhang B(r, c) = Aa(first + c); 14959d13fa56SJunchao Zhang } else { // this entry does not show up in the CSR 14969d13fa56SJunchao Zhang B(r, c) = 0.0; 14979d13fa56SJunchao Zhang } 14989d13fa56SJunchao Zhang } 14999d13fa56SJunchao Zhang } else { // rare case that the diagonal does not exist 15009d13fa56SJunchao Zhang const PetscInt begin = Ai(i); 15019d13fa56SJunchao Zhang const PetscInt end = Ai(i + 1); 15029d13fa56SJunchao Zhang for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0; 15039d13fa56SJunchao Zhang for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not. 15049d13fa56SJunchao Zhang if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j); 15059d13fa56SJunchao Zhang else if (Aj(j) >= rstart + m) break; 15069d13fa56SJunchao Zhang } 15079d13fa56SJunchao Zhang } 15089d13fa56SJunchao Zhang }); 15099d13fa56SJunchao Zhang 15109d13fa56SJunchao Zhang // LU-decompose B (w/o pivoting) and then invert B 15119d13fa56SJunchao Zhang KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0); 15129d13fa56SJunchao Zhang KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W); 15139d13fa56SJunchao Zhang })); 15149d13fa56SJunchao Zhang // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels 15159d13fa56SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 15169d13fa56SJunchao Zhang } 15179d13fa56SJunchao Zhang 1518d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1519d71ae5a4SJacob Faibussowitsch { 1520076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 1521076ba34aSJunchao Zhang PetscInt i, m, n; 1522*4df4a32cSJunchao Zhang auto exec = PetscGetKokkosExecutionSpace(); 1523076ba34aSJunchao Zhang 1524076ba34aSJunchao Zhang PetscFunctionBegin; 15255f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1526076ba34aSJunchao Zhang 1527076ba34aSJunchao Zhang m = akok->nrows(); 1528076ba34aSJunchao Zhang n = akok->ncols(); 15299566063dSJacob Faibussowitsch PetscCall(MatSetSizes(A, m, n, m, n)); 15309566063dSJacob Faibussowitsch PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1531076ba34aSJunchao Zhang 1532076ba34aSJunchao Zhang /* Set up data structures of A as a MATSEQAIJ */ 15339566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 153457508eceSPierre Jolivet aseq = (Mat_SeqAIJ *)A->data; 1535076ba34aSJunchao Zhang 1536e36ced11SJunchao Zhang PetscCallCXX(akok->i_dual.sync_host(exec)); /* We always need sync'ed i, j on host */ 1537e36ced11SJunchao Zhang PetscCallCXX(akok->j_dual.sync_host(exec)); 1538e36ced11SJunchao Zhang PetscCallCXX(exec.fence()); 1539076ba34aSJunchao Zhang 1540076ba34aSJunchao Zhang aseq->i = akok->i_host_data(); 1541076ba34aSJunchao Zhang aseq->j = akok->j_host_data(); 1542076ba34aSJunchao Zhang aseq->a = akok->a_host_data(); 1543076ba34aSJunchao Zhang aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1544076ba34aSJunchao Zhang aseq->free_a = PETSC_FALSE; 1545076ba34aSJunchao Zhang aseq->free_ij = PETSC_FALSE; 1546076ba34aSJunchao Zhang aseq->nz = akok->nnz(); 1547076ba34aSJunchao Zhang aseq->maxnz = aseq->nz; 1548076ba34aSJunchao Zhang 15499566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aseq->imax)); 15509566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aseq->ilen)); 1551ad540459SPierre Jolivet for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1552076ba34aSJunchao Zhang 1553076ba34aSJunchao Zhang /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1554076ba34aSJunchao Zhang akok->nonzerostate = A->nonzerostate; 1555ff751488SJunchao Zhang A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 15569566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 15579566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 15583ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1559076ba34aSJunchao Zhang } 1560076ba34aSJunchao Zhang 15610e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 15620e3ece09SJunchao Zhang { 15630e3ece09SJunchao Zhang PetscFunctionBegin; 15640e3ece09SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(A)); 15650e3ece09SJunchao Zhang *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 15660e3ece09SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 15670e3ece09SJunchao Zhang } 15680e3ece09SJunchao Zhang 15690e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 15700e3ece09SJunchao Zhang { 15710e3ece09SJunchao Zhang Mat_SeqAIJKokkos *akok; 15724d86920dSPierre Jolivet 15730e3ece09SJunchao Zhang PetscFunctionBegin; 15740e3ece09SJunchao Zhang PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 15750e3ece09SJunchao Zhang PetscCall(MatCreate(comm, A)); 15760e3ece09SJunchao Zhang PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 15770e3ece09SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 15780e3ece09SJunchao Zhang } 15790e3ece09SJunchao Zhang 1580076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1581076ba34aSJunchao Zhang 1582076ba34aSJunchao Zhang Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1583076ba34aSJunchao Zhang */ 1584d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1585d71ae5a4SJacob Faibussowitsch { 1586076ba34aSJunchao Zhang PetscFunctionBegin; 15879566063dSJacob Faibussowitsch PetscCall(MatCreate(comm, A)); 15889566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 15893ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 15908c3ff71bSJunchao Zhang } 15918c3ff71bSJunchao Zhang 1592152b3e56SJunchao Zhang /*@C 159311a5261eSBarry Smith MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 15948c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 159520f4b53cSBarry Smith Kokkos for calculations. 15968c3ff71bSJunchao Zhang 15978c3ff71bSJunchao Zhang Collective 15988c3ff71bSJunchao Zhang 15998c3ff71bSJunchao Zhang Input Parameters: 160011a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF` 16018c3ff71bSJunchao Zhang . m - number of rows 16028c3ff71bSJunchao Zhang . n - number of columns 160320f4b53cSBarry Smith . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 160420f4b53cSBarry Smith - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 16058c3ff71bSJunchao Zhang 16068c3ff71bSJunchao Zhang Output Parameter: 16078c3ff71bSJunchao Zhang . A - the matrix 16088c3ff71bSJunchao Zhang 16092ef1f0ffSBarry Smith Level: intermediate 16102ef1f0ffSBarry Smith 16112ef1f0ffSBarry Smith Notes: 161211a5261eSBarry Smith It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 16138c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 161411a5261eSBarry Smith [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 16158c3ff71bSJunchao Zhang 161611a5261eSBarry Smith The AIJ format, also called 16172ef1f0ffSBarry Smith compressed row storage, is fully compatible with standard Fortran 16188c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 161920f4b53cSBarry Smith either one (as in Fortran) or zero. 16208c3ff71bSJunchao Zhang 16212ef1f0ffSBarry Smith Specify the preallocated storage with either `nz` or `nnz` (not both). 16222ef1f0ffSBarry Smith Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 16232ef1f0ffSBarry Smith allocation. 16248c3ff71bSJunchao Zhang 1625fe59aa6dSJacob Faibussowitsch .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()` 16268c3ff71bSJunchao Zhang @*/ 1627d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1628d71ae5a4SJacob Faibussowitsch { 16298c3ff71bSJunchao Zhang PetscFunctionBegin; 16309566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 16319566063dSJacob Faibussowitsch PetscCall(MatCreate(comm, A)); 16329566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A, m, n, m, n)); 16339566063dSJacob Faibussowitsch PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 16349566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 16353ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 16368c3ff71bSJunchao Zhang } 1637930e68a5SMark Adams 1638d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1639d71ae5a4SJacob Faibussowitsch { 1640930e68a5SMark Adams PetscFunctionBegin; 16419566063dSJacob Faibussowitsch PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 164286a27549SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 16433ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 164486a27549SJunchao Zhang } 164586a27549SJunchao Zhang 1646d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1647d71ae5a4SJacob Faibussowitsch { 164886a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 164986a27549SJunchao Zhang 165086a27549SJunchao Zhang PetscFunctionBegin; 165186a27549SJunchao Zhang if (!factors->sptrsv_symbolic_completed) { 165286a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 165386a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 165486a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_TRUE; 165586a27549SJunchao Zhang } 16563ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 165786a27549SJunchao Zhang } 165886a27549SJunchao Zhang 165986a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */ 1660d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1661d71ae5a4SJacob Faibussowitsch { 166286a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1663076ba34aSJunchao Zhang MatColIdxType n = A->rmap->n; 166486a27549SJunchao Zhang 166586a27549SJunchao Zhang PetscFunctionBegin; 166686a27549SJunchao Zhang if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 166786a27549SJunchao Zhang /* Update L^T and do sptrsv symbolic */ 16687b8d4ba6SJunchao Zhang factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires 0 16697b8d4ba6SJunchao Zhang factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0)); 16707b8d4ba6SJunchao Zhang factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0)); 167186a27549SJunchao Zhang 16729371c9d4SSatish Balay transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 167386a27549SJunchao Zhang factors->iLt_d, factors->jLt_d, factors->aLt_d); 167486a27549SJunchao Zhang 167586a27549SJunchao Zhang /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 167686a27549SJunchao Zhang We have to sort the indices, until KK provides finer control options. 167786a27549SJunchao Zhang */ 16789371c9d4SSatish Balay sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 167986a27549SJunchao Zhang 168086a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 168186a27549SJunchao Zhang 168286a27549SJunchao Zhang /* Update U^T and do sptrsv symbolic */ 16837b8d4ba6SJunchao Zhang factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires 0 16847b8d4ba6SJunchao Zhang factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0)); 16857b8d4ba6SJunchao Zhang factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0)); 168686a27549SJunchao Zhang 16879371c9d4SSatish Balay transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 168886a27549SJunchao Zhang factors->iUt_d, factors->jUt_d, factors->aUt_d); 168986a27549SJunchao Zhang 169086a27549SJunchao Zhang /* Sort indices. See comments above */ 16919371c9d4SSatish Balay sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 169286a27549SJunchao Zhang 169386a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 169486a27549SJunchao Zhang factors->transpose_updated = PETSC_TRUE; 169586a27549SJunchao Zhang } 16963ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 169786a27549SJunchao Zhang } 169886a27549SJunchao Zhang 169986a27549SJunchao Zhang /* Solve Ax = b, with A = LU */ 1700d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1701d71ae5a4SJacob Faibussowitsch { 170286a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 170386a27549SJunchao Zhang PetscScalarKokkosView xv; 170486a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 170586a27549SJunchao Zhang 170686a27549SJunchao Zhang PetscFunctionBegin; 17079566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 17089566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 17099566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(b, &bv)); 17109566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(x, &xv)); 171186a27549SJunchao Zhang /* Solve L tmpv = b */ 17129566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 171386a27549SJunchao Zhang /* Solve Ux = tmpv */ 17149566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 17159566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(b, &bv)); 17169566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 17179566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 17183ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 171986a27549SJunchao Zhang } 172086a27549SJunchao Zhang 1721076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */ 1722d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1723d71ae5a4SJacob Faibussowitsch { 172486a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 172586a27549SJunchao Zhang PetscScalarKokkosView xv; 172686a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 172786a27549SJunchao Zhang 172886a27549SJunchao Zhang PetscFunctionBegin; 17299566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 17309566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 17319566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(b, &bv)); 17329566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(x, &xv)); 173386a27549SJunchao Zhang /* Solve U^T tmpv = b */ 173486a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 173586a27549SJunchao Zhang 173686a27549SJunchao Zhang /* Solve L^T x = tmpv */ 173786a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 17389566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(b, &bv)); 17399566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 17409566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 17413ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 174286a27549SJunchao Zhang } 174386a27549SJunchao Zhang 1744d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1745d71ae5a4SJacob Faibussowitsch { 174686a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 174786a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 174886a27549SJunchao Zhang PetscInt fill_lev = info->levels; 174986a27549SJunchao Zhang 175086a27549SJunchao Zhang PetscFunctionBegin; 17519566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 17529566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1753076ba34aSJunchao Zhang 1754076ba34aSJunchao Zhang auto a_d = aijkok->a_dual.view_device(); 1755076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1756076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1757076ba34aSJunchao Zhang 1758076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 175986a27549SJunchao Zhang 176086a27549SJunchao Zhang B->assembled = PETSC_TRUE; 176186a27549SJunchao Zhang B->preallocated = PETSC_TRUE; 176286a27549SJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos; 176386a27549SJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 176486a27549SJunchao Zhang B->ops->matsolve = NULL; 176586a27549SJunchao Zhang B->ops->matsolvetranspose = NULL; 176686a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 176786a27549SJunchao Zhang 176886a27549SJunchao Zhang /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 176986a27549SJunchao Zhang factors->transpose_updated = PETSC_FALSE; 177086a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 1771eeadb341SJunchao Zhang /* TODO: log flops, but how to know that? */ 17729566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 17733ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 177486a27549SJunchao Zhang } 177586a27549SJunchao Zhang 1776d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1777d71ae5a4SJacob Faibussowitsch { 177886a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 177986a27549SJunchao Zhang Mat_SeqAIJ *b; 178086a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 178186a27549SJunchao Zhang PetscInt fill_lev = info->levels; 178286a27549SJunchao Zhang PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 178386a27549SJunchao Zhang PetscInt n = A->rmap->n; 178486a27549SJunchao Zhang 178586a27549SJunchao Zhang PetscFunctionBegin; 17869566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 178786a27549SJunchao Zhang /* Rebuild factors */ 17889371c9d4SSatish Balay if (factors) { 17899371c9d4SSatish Balay factors->Destroy(); 17909371c9d4SSatish Balay } /* Destroy the old if it exists */ 17919371c9d4SSatish Balay else { 17929371c9d4SSatish Balay B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 17939371c9d4SSatish Balay } 179486a27549SJunchao Zhang 179586a27549SJunchao Zhang /* Create a spiluk handle and then do symbolic factorization */ 179686a27549SJunchao Zhang nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 179786a27549SJunchao Zhang factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 179886a27549SJunchao Zhang 179986a27549SJunchao Zhang auto spiluk_handle = factors->kh.get_spiluk_handle(); 180086a27549SJunchao Zhang 180186a27549SJunchao Zhang Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 180286a27549SJunchao Zhang Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 180386a27549SJunchao Zhang Kokkos::realloc(factors->iU_d, n + 1); 180486a27549SJunchao Zhang Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 180586a27549SJunchao Zhang 180686a27549SJunchao Zhang aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1807076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1808076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1809076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 181086a27549SJunchao Zhang /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 181186a27549SJunchao Zhang 181286a27549SJunchao Zhang Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 181386a27549SJunchao Zhang Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 181486a27549SJunchao Zhang Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 181586a27549SJunchao Zhang Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 181686a27549SJunchao Zhang 181786a27549SJunchao Zhang /* TODO: add options to select sptrsv algorithms */ 181886a27549SJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 181986a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 182086a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 182186a27549SJunchao Zhang #else 182286a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 182386a27549SJunchao Zhang #endif 182486a27549SJunchao Zhang 182586a27549SJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 182686a27549SJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 182786a27549SJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 182886a27549SJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 182986a27549SJunchao Zhang 183086a27549SJunchao Zhang /* Fill fields of the factor matrix B */ 18319566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 183286a27549SJunchao Zhang b = (Mat_SeqAIJ *)B->data; 183386a27549SJunchao Zhang b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 183486a27549SJunchao Zhang B->info.fill_ratio_given = info->fill; 1835a1e4e190SStefano Zampini B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 183686a27549SJunchao Zhang 183786a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 183886a27549SJunchao Zhang B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 18393ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1840930e68a5SMark Adams } 1841930e68a5SMark Adams 1842d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1843d71ae5a4SJacob Faibussowitsch { 1844930e68a5SMark Adams PetscFunctionBegin; 1845930e68a5SMark Adams *type = MATSOLVERKOKKOS; 18463ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1847930e68a5SMark Adams } 1848930e68a5SMark Adams 1849930e68a5SMark Adams /*MC 185086a27549SJunchao Zhang MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 185111a5261eSBarry Smith on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1852930e68a5SMark Adams 1853930e68a5SMark Adams Level: beginner 1854930e68a5SMark Adams 18551cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1856930e68a5SMark Adams M*/ 185786a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1858930e68a5SMark Adams { 1859930e68a5SMark Adams PetscInt n = A->rmap->n; 1860930e68a5SMark Adams 1861930e68a5SMark Adams PetscFunctionBegin; 18629566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 18639566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*B, n, n, n, n)); 1864930e68a5SMark Adams (*B)->factortype = ftype; 18659566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 18669566063dSJacob Faibussowitsch PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1867930e68a5SMark Adams 18688f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 18699566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 187086a27549SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 187186a27549SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 187286a27549SJunchao Zhang } else if (ftype == MAT_FACTOR_ILU) { 18739566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 187486a27549SJunchao Zhang (*B)->canuseordering = PETSC_FALSE; 187586a27549SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 187698921bdaSJacob Faibussowitsch } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1877930e68a5SMark Adams 18789566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 1879f4f49eeaSPierre Jolivet PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 18803ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1881930e68a5SMark Adams } 18828f7e8f9dSMark Adams 1883d1f0640dSPierre Jolivet PETSC_INTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1884d71ae5a4SJacob Faibussowitsch { 188586a27549SJunchao Zhang PetscFunctionBegin; 18869566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 18879566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 18883ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 188986a27549SJunchao Zhang } 189086a27549SJunchao Zhang 1891076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */ 1892d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1893d71ae5a4SJacob Faibussowitsch { 189445402d8aSJunchao Zhang const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map); 189545402d8aSJunchao Zhang const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries); 189645402d8aSJunchao Zhang const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values); 1897076ba34aSJunchao Zhang const PetscInt *i = iv.data(); 1898076ba34aSJunchao Zhang const PetscInt *j = jv.data(); 1899076ba34aSJunchao Zhang const PetscScalar *a = av.data(); 1900076ba34aSJunchao Zhang PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 1901076ba34aSJunchao Zhang 1902076ba34aSJunchao Zhang PetscFunctionBegin; 19039566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 1904076ba34aSJunchao Zhang for (PetscInt k = 0; k < m; k++) { 19059566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 190648a46eb9SPierre Jolivet for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 19079566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 1908076ba34aSJunchao Zhang } 19093ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1910076ba34aSJunchao Zhang } 1911