111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 2076ba34aSJunchao Zhang #include <petscpkg_version.h> 3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 442550becSJunchao Zhang #include <petsc/private/sfimpl.h> 58c3ff71bSJunchao Zhang #include <petscsystypes.h> 68c3ff71bSJunchao Zhang #include <petscerror.h> 78c3ff71bSJunchao Zhang 88c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 9f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 108c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 118c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 1286a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp> 1386a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp> 14076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp> 15076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp> 1686a27549SJunchao Zhang 1742550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 188c3ff71bSJunchao Zhang 190e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0) 20f98996d3SJunchao Zhang #include <KokkosSparse_Utils.hpp> 21f98996d3SJunchao Zhang using KokkosSparse::sort_crs_matrix; 229371c9d4SSatish Balay using KokkosSparse::Impl::transpose_matrix; 23f98996d3SJunchao Zhang #else 24f98996d3SJunchao Zhang #include <KokkosKernels_Sorting.hpp> 25f98996d3SJunchao Zhang using KokkosKernels::sort_crs_matrix; 269371c9d4SSatish Balay using KokkosKernels::Impl::transpose_matrix; 27f98996d3SJunchao Zhang #endif 28f98996d3SJunchao Zhang 298c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 308c3ff71bSJunchao Zhang 31076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 32076ba34aSJunchao Zhang we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 33076ba34aSJunchao Zhang In the latter case, it is important to set a_dual's sync state correctly. 34076ba34aSJunchao Zhang */ 35d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 36d71ae5a4SJacob Faibussowitsch { 37076ba34aSJunchao Zhang Mat_SeqAIJ *aijseq; 38076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 398c3ff71bSJunchao Zhang 408c3ff71bSJunchao Zhang PetscFunctionBegin; 413ba16761SJacob Faibussowitsch if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 429566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 43076ba34aSJunchao Zhang 44076ba34aSJunchao Zhang aijseq = static_cast<Mat_SeqAIJ *>(A->data); 45076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 46076ba34aSJunchao Zhang 47076ba34aSJunchao Zhang /* If aijkok does not exist, we just copy i, j to device. 48076ba34aSJunchao Zhang If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 49076ba34aSJunchao Zhang In both cases, we build a new aijkok structure. 50076ba34aSJunchao Zhang */ 51076ba34aSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 52076ba34aSJunchao Zhang delete aijkok; 53076ba34aSJunchao Zhang aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 54076ba34aSJunchao Zhang A->spptr = aijkok; 55076ba34aSJunchao Zhang } 56076ba34aSJunchao Zhang 575519a089SJose E. Roman if (aijkok->device_mat_d.data()) { 58a587d139SMark A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 59a587d139SMark } 603ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 618c3ff71bSJunchao Zhang } 628c3ff71bSJunchao Zhang 6386a27549SJunchao Zhang /* Sync CSR data to device if not yet */ 64d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 65d71ae5a4SJacob Faibussowitsch { 668c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 678c3ff71bSJunchao Zhang 688c3ff71bSJunchao Zhang PetscFunctionBegin; 69aaa8cc7dSPierre Jolivet PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device"); 705f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 71076ba34aSJunchao Zhang if (aijkok->a_dual.need_sync_device()) { 72076ba34aSJunchao Zhang aijkok->a_dual.sync_device(); 73580c7c76SPierre Jolivet aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 7486a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 758c3ff71bSJunchao Zhang } 763ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 778c3ff71bSJunchao Zhang } 788c3ff71bSJunchao Zhang 79076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */ 80d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 81d71ae5a4SJacob Faibussowitsch { 8286a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 8386a27549SJunchao Zhang 8486a27549SJunchao Zhang PetscFunctionBegin; 855f80ce2aSJacob Faibussowitsch PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 8686a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 8786a27549SJunchao Zhang aijkok->a_dual.modify_device(); 8886a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 8986a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 909566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 919566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 923ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 9386a27549SJunchao Zhang } 9486a27549SJunchao Zhang 95d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 96d71ae5a4SJacob Faibussowitsch { 97f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 98f0cf5187SStefano Zampini 99f0cf5187SStefano Zampini PetscFunctionBegin; 100f0cf5187SStefano Zampini PetscCheckTypeName(A, MATSEQAIJKOKKOS); 10186a27549SJunchao Zhang /* We do not expect one needs factors on host */ 102aaa8cc7dSPierre Jolivet PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host"); 1035f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 104076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 1053ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 106f0cf5187SStefano Zampini } 107f0cf5187SStefano Zampini 108d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 109d71ae5a4SJacob Faibussowitsch { 110076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 111f0cf5187SStefano Zampini 112f0cf5187SStefano Zampini PetscFunctionBegin; 1135519a089SJose E. Roman /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 1145519a089SJose E. Roman Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 1155519a089SJose E. Roman reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 1165519a089SJose E. Roman must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 1175519a089SJose E. Roman */ 1185519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 119076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 120076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 121076ba34aSJunchao Zhang } else { /* Happens when calling MatSetValues on a newly created matrix */ 122076ba34aSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 123076ba34aSJunchao Zhang } 1243ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 125076ba34aSJunchao Zhang } 126076ba34aSJunchao Zhang 127d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 128d71ae5a4SJacob Faibussowitsch { 129076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 130076ba34aSJunchao Zhang 131076ba34aSJunchao Zhang PetscFunctionBegin; 1325519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 1333ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 134076ba34aSJunchao Zhang } 135076ba34aSJunchao Zhang 136d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 137d71ae5a4SJacob Faibussowitsch { 138076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 139076ba34aSJunchao Zhang 140076ba34aSJunchao Zhang PetscFunctionBegin; 1415519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 142076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 143076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1442328674fSJunchao Zhang } else { 1452328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 1462328674fSJunchao Zhang } 1473ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 148076ba34aSJunchao Zhang } 149076ba34aSJunchao Zhang 150d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 151d71ae5a4SJacob Faibussowitsch { 152076ba34aSJunchao Zhang PetscFunctionBegin; 153076ba34aSJunchao Zhang *array = NULL; 1543ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 155076ba34aSJunchao Zhang } 156076ba34aSJunchao Zhang 157d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 158d71ae5a4SJacob Faibussowitsch { 159076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 160076ba34aSJunchao Zhang 161076ba34aSJunchao Zhang PetscFunctionBegin; 1625519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 163076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1642328674fSJunchao Zhang } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 1652328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 1662328674fSJunchao Zhang } 1673ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 168076ba34aSJunchao Zhang } 169076ba34aSJunchao Zhang 170d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 171d71ae5a4SJacob Faibussowitsch { 172076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 173076ba34aSJunchao Zhang 174076ba34aSJunchao Zhang PetscFunctionBegin; 1755519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 176076ba34aSJunchao Zhang aijkok->a_dual.clear_sync_state(); 177076ba34aSJunchao Zhang aijkok->a_dual.modify_host(); 1782328674fSJunchao Zhang } 1793ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 180f0cf5187SStefano Zampini } 181f0cf5187SStefano Zampini 182d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 183d71ae5a4SJacob Faibussowitsch { 1847ee59b9bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1857ee59b9bSJunchao Zhang 1867ee59b9bSJunchao Zhang PetscFunctionBegin; 1877ee59b9bSJunchao Zhang PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 1887ee59b9bSJunchao Zhang 1897ee59b9bSJunchao Zhang if (i) *i = aijkok->i_device_data(); 1907ee59b9bSJunchao Zhang if (j) *j = aijkok->j_device_data(); 1917ee59b9bSJunchao Zhang if (a) { 1927ee59b9bSJunchao Zhang aijkok->a_dual.sync_device(); 1937ee59b9bSJunchao Zhang *a = aijkok->a_device_data(); 1947ee59b9bSJunchao Zhang } 1957ee59b9bSJunchao Zhang if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 1963ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1977ee59b9bSJunchao Zhang } 1987ee59b9bSJunchao Zhang 199a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 200d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 201d71ae5a4SJacob Faibussowitsch { 202042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 203042217e8SBarry Smith Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 204a587d139SMark 205a587d139SMark PetscFunctionBegin; 2065f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 207152b3e56SJunchao Zhang aijkok->device_mat_d = create_mirror(DefaultMemorySpace(), h_mat_k); 208a587d139SMark Kokkos::deep_copy(aijkok->device_mat_d, h_mat_k); 2093ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 210a587d139SMark } 211a587d139SMark 212a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 213d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 214d71ae5a4SJacob Faibussowitsch { 215042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 216a587d139SMark 217a587d139SMark PetscFunctionBegin; 218a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 219a587d139SMark *d_mat = aijkok->device_mat_d.data(); 220a587d139SMark } else { 2219566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it) 222a587d139SMark *d_mat = NULL; 223a587d139SMark } 2243ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 225a587d139SMark } 226076ba34aSJunchao Zhang 2270e3ece09SJunchao Zhang /* 2280e3ece09SJunchao Zhang Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device. 2290e3ece09SJunchao Zhang 2300e3ece09SJunchao Zhang Input Parameter: 2310e3ece09SJunchao Zhang . A - the MATSEQAIJKOKKOS matrix 2320e3ece09SJunchao Zhang 2330e3ece09SJunchao Zhang Output Parameters: 2340e3ece09SJunchao Zhang + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i)) 235aaa8cc7dSPierre Jolivet - T_d - the transpose on device, whose value array is allocated but not initialized 2360e3ece09SJunchao Zhang */ 2370e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d) 238d71ae5a4SJacob Faibussowitsch { 2390e3ece09SJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 2400e3ece09SJunchao Zhang PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 2410e3ece09SJunchao Zhang const PetscInt *Ai = aseq->i, *Aj = aseq->j; 2420e3ece09SJunchao Zhang MatRowMapKokkosViewHost Ti_h("Ti", n + 1); 2430e3ece09SJunchao Zhang MatRowMapType *Ti = Ti_h.data(); 2440e3ece09SJunchao Zhang MatColIdxKokkosViewHost Tj_h("Tj", nz); 2450e3ece09SJunchao Zhang MatRowMapKokkosViewHost perm_h("permutation", nz); 2460e3ece09SJunchao Zhang PetscInt *Tj = Tj_h.data(); 2470e3ece09SJunchao Zhang PetscInt *perm = perm_h.data(); 2480e3ece09SJunchao Zhang PetscInt *offset; 249152b3e56SJunchao Zhang 250152b3e56SJunchao Zhang PetscFunctionBegin; 2510e3ece09SJunchao Zhang // Populate Ti 2520e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(Ti_h, 0)); 2530e3ece09SJunchao Zhang Ti++; 2540e3ece09SJunchao Zhang for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++; 2550e3ece09SJunchao Zhang Ti--; 2560e3ece09SJunchao Zhang for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i]; 2570e3ece09SJunchao Zhang 2580e3ece09SJunchao Zhang // Populate Tj and the permutation array 2590e3ece09SJunchao Zhang PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices 2600e3ece09SJunchao Zhang for (PetscInt i = 0; i < m; i++) { 2610e3ece09SJunchao Zhang for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i) 2620e3ece09SJunchao Zhang PetscInt r = Aj[j]; // row r of T 2630e3ece09SJunchao Zhang PetscInt disp = Ti[r] + offset[r]; 2640e3ece09SJunchao Zhang 2650e3ece09SJunchao Zhang Tj[disp] = i; // col i of T 2660e3ece09SJunchao Zhang perm[disp] = j; 2670e3ece09SJunchao Zhang offset[r]++; 268076ba34aSJunchao Zhang } 2690e3ece09SJunchao Zhang } 2700e3ece09SJunchao Zhang PetscCall(PetscFree(offset)); 2710e3ece09SJunchao Zhang 2720e3ece09SJunchao Zhang // Sort each row of T, along with the permutation array 2730e3ece09SJunchao Zhang for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i])); 2740e3ece09SJunchao Zhang 2750e3ece09SJunchao Zhang // Output perm and T on device 2760e3ece09SJunchao Zhang auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h); 2770e3ece09SJunchao Zhang auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h); 2780e3ece09SJunchao Zhang PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d)); 2790e3ece09SJunchao Zhang PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h)); 2803ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 281152b3e56SJunchao Zhang } 282152b3e56SJunchao Zhang 2830e3ece09SJunchao Zhang // Generate the transpose on device and cache it internally 2840e3ece09SJunchao Zhang // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own 2850e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT) 286d71ae5a4SJacob Faibussowitsch { 2870e3ece09SJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 2880e3ece09SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 2890e3ece09SJunchao Zhang PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 2900e3ece09SJunchao Zhang KokkosCsrMatrix &T = akok->csrmatT; 291152b3e56SJunchao Zhang 292152b3e56SJunchao Zhang PetscFunctionBegin; 2930e3ece09SJunchao Zhang PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 2940e3ece09SJunchao Zhang PetscCallCXX(akok->a_dual.sync_device()); // Sync A's valeus since we are going to access them on device 2950e3ece09SJunchao Zhang 2960e3ece09SJunchao Zhang const auto &Aa = akok->a_dual.view_device(); 2970e3ece09SJunchao Zhang 2980e3ece09SJunchao Zhang if (A->symmetric == PETSC_BOOL3_TRUE) { 2990e3ece09SJunchao Zhang *csrmatT = akok->csrmat; 3000e3ece09SJunchao Zhang } else { 3010e3ece09SJunchao Zhang // See if we already have a cached transpose and its value is up to date 3020e3ece09SJunchao Zhang if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 3030e3ece09SJunchao Zhang if (!akok->transpose_updated) { // if the value is out of date, update the cached version 3040e3ece09SJunchao Zhang const auto &perm = akok->transpose_perm; // get the permutation array 3050e3ece09SJunchao Zhang auto &Ta = T.values; 3060e3ece09SJunchao Zhang 3070e3ece09SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( 3080e3ece09SJunchao Zhang nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); })); 309076ba34aSJunchao Zhang } 3100e3ece09SJunchao Zhang } else { // Generate T of size n x m for the first time 3110e3ece09SJunchao Zhang MatRowMapKokkosView perm; 3120e3ece09SJunchao Zhang 3130e3ece09SJunchao Zhang PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 3140e3ece09SJunchao Zhang akok->transpose_perm = perm; // cache the perm in this matrix for reuse 3150e3ece09SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( 3160e3ece09SJunchao Zhang nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); })); 3170e3ece09SJunchao Zhang } 3180e3ece09SJunchao Zhang akok->transpose_updated = PETSC_TRUE; 3190e3ece09SJunchao Zhang *csrmatT = akok->csrmatT; 3200e3ece09SJunchao Zhang } 3210e3ece09SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 3220e3ece09SJunchao Zhang } 3230e3ece09SJunchao Zhang 3240e3ece09SJunchao Zhang // Generate the Hermitian on device and cache it internally 3250e3ece09SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH) 3260e3ece09SJunchao Zhang { 3270e3ece09SJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 3280e3ece09SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 3290e3ece09SJunchao Zhang PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n; 3300e3ece09SJunchao Zhang KokkosCsrMatrix &T = akok->csrmatH; 3310e3ece09SJunchao Zhang 3320e3ece09SJunchao Zhang PetscFunctionBegin; 3330e3ece09SJunchao Zhang PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 3340e3ece09SJunchao Zhang PetscCallCXX(akok->a_dual.sync_device()); // Sync A's values since we are going to access them on device 3350e3ece09SJunchao Zhang 3360e3ece09SJunchao Zhang const auto &Aa = akok->a_dual.view_device(); 3370e3ece09SJunchao Zhang 3380e3ece09SJunchao Zhang if (A->hermitian == PETSC_BOOL3_TRUE) { 3390e3ece09SJunchao Zhang *csrmatH = akok->csrmat; 3400e3ece09SJunchao Zhang } else { 3410e3ece09SJunchao Zhang // See if we already have a cached hermitian and its value is up to date 3420e3ece09SJunchao Zhang if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction 3430e3ece09SJunchao Zhang if (!akok->hermitian_updated) { // if the value is out of date, update the cached version 3440e3ece09SJunchao Zhang const auto &perm = akok->transpose_perm; // get the permutation array 3450e3ece09SJunchao Zhang auto &Ta = T.values; 3460e3ece09SJunchao Zhang 3470e3ece09SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( 3480e3ece09SJunchao Zhang nz, KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); })); 3490e3ece09SJunchao Zhang } 3500e3ece09SJunchao Zhang } else { // Generate T of size n x m for the first time 3510e3ece09SJunchao Zhang MatRowMapKokkosView perm; 3520e3ece09SJunchao Zhang 3530e3ece09SJunchao Zhang PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T)); 3540e3ece09SJunchao Zhang akok->transpose_perm = perm; // cache the perm in this matrix for reuse 3550e3ece09SJunchao Zhang PetscCallCXX(Kokkos::parallel_for( 3560e3ece09SJunchao Zhang nz, KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); })); 3570e3ece09SJunchao Zhang } 3580e3ece09SJunchao Zhang akok->hermitian_updated = PETSC_TRUE; 3590e3ece09SJunchao Zhang *csrmatH = akok->csrmatH; 3600e3ece09SJunchao Zhang } 3613ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 362152b3e56SJunchao Zhang } 363a587d139SMark 3648c3ff71bSJunchao Zhang /* y = A x */ 365d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 366d71ae5a4SJacob Faibussowitsch { 3678c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 368152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 369152b3e56SJunchao Zhang PetscScalarKokkosView yv; 3708c3ff71bSJunchao Zhang 3718c3ff71bSJunchao Zhang PetscFunctionBegin; 3729566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3739566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3749566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 3759566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 3768c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 3779d52486cSJunchao Zhang PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 3789566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 3799566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 380076ba34aSJunchao Zhang /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 3819566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 3829566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3833ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3848c3ff71bSJunchao Zhang } 3858c3ff71bSJunchao Zhang 3868c3ff71bSJunchao Zhang /* y = A^T x */ 387d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 388d71ae5a4SJacob Faibussowitsch { 3898c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 390152b3e56SJunchao Zhang const char *mode; 391152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 392152b3e56SJunchao Zhang PetscScalarKokkosView yv; 3930e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 3948c3ff71bSJunchao Zhang 3958c3ff71bSJunchao Zhang PetscFunctionBegin; 3969566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3979566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3989566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 3999566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 400152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 4019566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 402152b3e56SJunchao Zhang mode = "N"; 403152b3e56SJunchao Zhang } else { 404076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 4050e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 406152b3e56SJunchao Zhang mode = "T"; 407152b3e56SJunchao Zhang } 4080e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 4099566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 4109566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 4110e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 4129566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4133ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4148c3ff71bSJunchao Zhang } 4158c3ff71bSJunchao Zhang 4168c3ff71bSJunchao Zhang /* y = A^H x */ 417d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 418d71ae5a4SJacob Faibussowitsch { 4198c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 420152b3e56SJunchao Zhang const char *mode; 421152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 422152b3e56SJunchao Zhang PetscScalarKokkosView yv; 4230e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 4248c3ff71bSJunchao Zhang 4258c3ff71bSJunchao Zhang PetscFunctionBegin; 4269566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4279566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 4289566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 4299566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 430152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 4319566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 432152b3e56SJunchao Zhang mode = "N"; 433152b3e56SJunchao Zhang } else { 434076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 4350e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 436152b3e56SJunchao Zhang mode = "C"; 437152b3e56SJunchao Zhang } 4380e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 4399566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 4409566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 4410e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 4429566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4433ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4448c3ff71bSJunchao Zhang } 4458c3ff71bSJunchao Zhang 4468c3ff71bSJunchao Zhang /* z = A x + y */ 447d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 448d71ae5a4SJacob Faibussowitsch { 4498c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 450152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv, yv; 451152b3e56SJunchao Zhang PetscScalarKokkosView zv; 4528c3ff71bSJunchao Zhang 4538c3ff71bSJunchao Zhang PetscFunctionBegin; 4549566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4559566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 4569566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 4579566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(yy, &yv)); 4589566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(zz, &zv)); 4598c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv, yv); 4608c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 4619d52486cSJunchao Zhang PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 4629566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 4639566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(yy, &yv)); 4649566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 4659566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 4669566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4673ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4688c3ff71bSJunchao Zhang } 4698c3ff71bSJunchao Zhang 4708c3ff71bSJunchao Zhang /* z = A^T x + y */ 471d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 472d71ae5a4SJacob Faibussowitsch { 4738c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 474152b3e56SJunchao Zhang const char *mode; 475152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv, yv; 476152b3e56SJunchao Zhang PetscScalarKokkosView zv; 4770e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 4788c3ff71bSJunchao Zhang 4798c3ff71bSJunchao Zhang PetscFunctionBegin; 4809566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4819566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 4829566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 4839566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(yy, &yv)); 4849566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(zz, &zv)); 4858c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv, yv); 486152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 4879566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 488152b3e56SJunchao Zhang mode = "N"; 489152b3e56SJunchao Zhang } else { 490076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 4910e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 492152b3e56SJunchao Zhang mode = "T"; 493152b3e56SJunchao Zhang } 4940e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 4959566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 4969566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(yy, &yv)); 4979566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 4980e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 4999566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 5003ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5018c3ff71bSJunchao Zhang } 5028c3ff71bSJunchao Zhang 5038c3ff71bSJunchao Zhang /* z = A^H x + y */ 504d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 505d71ae5a4SJacob Faibussowitsch { 5068c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 507152b3e56SJunchao Zhang const char *mode; 508152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv, yv; 509152b3e56SJunchao Zhang PetscScalarKokkosView zv; 5100e3ece09SJunchao Zhang KokkosCsrMatrix csrmat; 5118c3ff71bSJunchao Zhang 5128c3ff71bSJunchao Zhang PetscFunctionBegin; 5139566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 5149566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 5159566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 5169566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(yy, &yv)); 5179566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(zz, &zv)); 5188c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv, yv); 519152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 5209566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 521152b3e56SJunchao Zhang mode = "N"; 522152b3e56SJunchao Zhang } else { 523076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 5240e3ece09SJunchao Zhang csrmat = aijkok->csrmat; 525152b3e56SJunchao Zhang mode = "C"; 526152b3e56SJunchao Zhang } 5270e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 5289566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 5299566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(yy, &yv)); 5309566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 5310e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz())); 5329566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 5333ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 534152b3e56SJunchao Zhang } 535152b3e56SJunchao Zhang 536d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 537d71ae5a4SJacob Faibussowitsch { 538152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 539152b3e56SJunchao Zhang 540152b3e56SJunchao Zhang PetscFunctionBegin; 541152b3e56SJunchao Zhang switch (op) { 542152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 543152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 5449566063dSJacob Faibussowitsch if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 545152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 546152b3e56SJunchao Zhang break; 547d71ae5a4SJacob Faibussowitsch default: 548d71ae5a4SJacob Faibussowitsch PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 549d71ae5a4SJacob Faibussowitsch break; 550152b3e56SJunchao Zhang } 5513ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5528c3ff71bSJunchao Zhang } 5538c3ff71bSJunchao Zhang 554076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */ 555d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 556d71ae5a4SJacob Faibussowitsch { 557076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 5588c3ff71bSJunchao Zhang 5598c3ff71bSJunchao Zhang PetscFunctionBegin; 5609566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 561076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 5629566063dSJacob Faibussowitsch PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 5638c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 5649566063dSJacob Faibussowitsch PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 565076ba34aSJunchao Zhang } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 5665f80ce2aSJacob Faibussowitsch PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 5679566063dSJacob Faibussowitsch PetscCall(PetscFree(A->defaultvectype)); 5689566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 5699566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 5709566063dSJacob Faibussowitsch PetscCall(MatSetOps_SeqAIJKokkos(A)); 571076ba34aSJunchao Zhang aseq = static_cast<Mat_SeqAIJ *>(A->data); 572394ed5ebSJunchao Zhang if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 5735f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 574076ba34aSJunchao Zhang A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE); 5758c3ff71bSJunchao Zhang } 576076ba34aSJunchao Zhang } 5773ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5788c3ff71bSJunchao Zhang } 5798c3ff71bSJunchao Zhang 580076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 581076ba34aSJunchao Zhang an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 582076ba34aSJunchao Zhang */ 583d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 584d71ae5a4SJacob Faibussowitsch { 585076ba34aSJunchao Zhang Mat_SeqAIJ *bseq; 586076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 587076ba34aSJunchao Zhang Mat mat; 5888c3ff71bSJunchao Zhang 5898c3ff71bSJunchao Zhang PetscFunctionBegin; 590076ba34aSJunchao Zhang /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 5919566063dSJacob Faibussowitsch PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 592076ba34aSJunchao Zhang mat = *B; 593076ba34aSJunchao Zhang if (A->assembled) { 594076ba34aSJunchao Zhang bseq = static_cast<Mat_SeqAIJ *>(mat->data); 595076ba34aSJunchao Zhang bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE); 596076ba34aSJunchao Zhang bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 597076ba34aSJunchao Zhang /* Now copy values to B if needed */ 598076ba34aSJunchao Zhang if (dupOption == MAT_COPY_VALUES) { 599076ba34aSJunchao Zhang if (akok->a_dual.need_sync_device()) { 600076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 601076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 602076ba34aSJunchao Zhang } else { /* If device has the latest data, we only copy data on device */ 603076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 604076ba34aSJunchao Zhang bkok->a_dual.modify_device(); 605076ba34aSJunchao Zhang } 606076ba34aSJunchao Zhang } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 607076ba34aSJunchao Zhang /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 608076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 609076ba34aSJunchao Zhang } 610076ba34aSJunchao Zhang mat->spptr = bkok; 611076ba34aSJunchao Zhang } 612076ba34aSJunchao Zhang 6139566063dSJacob Faibussowitsch PetscCall(PetscFree(mat->defaultvectype)); 6149566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 6159566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 6169566063dSJacob Faibussowitsch PetscCall(MatSetOps_SeqAIJKokkos(mat)); 6173ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6188c3ff71bSJunchao Zhang } 6198c3ff71bSJunchao Zhang 620d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 621d71ae5a4SJacob Faibussowitsch { 6220ecb592aSJunchao Zhang Mat At; 6230e3ece09SJunchao Zhang KokkosCsrMatrix internT; 6240ecb592aSJunchao Zhang Mat_SeqAIJKokkos *atkok, *bkok; 6250ecb592aSJunchao Zhang 6260ecb592aSJunchao Zhang PetscFunctionBegin; 6277fb60732SBarry Smith if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 6289566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 6290ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 630ff751488SJunchao Zhang /* Deep copy internT, as we want to isolate the internal transpose */ 6310e3ece09SJunchao Zhang PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT))); 6329566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 6330ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) *B = At; 6349566063dSJacob Faibussowitsch else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 6350ecb592aSJunchao Zhang } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 6360ecb592aSJunchao Zhang if ((*B)->assembled) { 6370ecb592aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 6380e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values)); 6399566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 6400ecb592aSJunchao Zhang } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 6410ecb592aSJunchao Zhang Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 6420e3ece09SJunchao Zhang MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */ 6430e3ece09SJunchao Zhang MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz()); 6440e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(a_h, internT.values)); 6450e3ece09SJunchao Zhang PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries)); 6460ecb592aSJunchao Zhang } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 6470ecb592aSJunchao Zhang } 6483ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6490ecb592aSJunchao Zhang } 6500ecb592aSJunchao Zhang 651d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 652d71ae5a4SJacob Faibussowitsch { 65386a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 6548c3ff71bSJunchao Zhang 6558c3ff71bSJunchao Zhang PetscFunctionBegin; 65686a27549SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 65786a27549SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 6588c3ff71bSJunchao Zhang delete aijkok; 65986a27549SJunchao Zhang } else { 66086a27549SJunchao Zhang delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 66186a27549SJunchao Zhang } 662cbc6b225SStefano Zampini A->spptr = NULL; 6639566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 6649566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 6659566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 6669566063dSJacob Faibussowitsch PetscCall(MatDestroy_SeqAIJ(A)); 6673ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6688c3ff71bSJunchao Zhang } 6698c3ff71bSJunchao Zhang 6703f3ba80aSJunchao Zhang /*MC 6713f3ba80aSJunchao Zhang MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 6723f3ba80aSJunchao Zhang 6733f3ba80aSJunchao Zhang A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 6743f3ba80aSJunchao Zhang 6752ef1f0ffSBarry Smith Options Database Key: 67611a5261eSBarry Smith . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 6773f3ba80aSJunchao Zhang 6783f3ba80aSJunchao Zhang Level: beginner 6793f3ba80aSJunchao Zhang 680*1cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 6813f3ba80aSJunchao Zhang M*/ 682d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 683d71ae5a4SJacob Faibussowitsch { 68486a27549SJunchao Zhang PetscFunctionBegin; 6859566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 6869566063dSJacob Faibussowitsch PetscCall(MatCreate_SeqAIJ(A)); 6879566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 6883ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 68986a27549SJunchao Zhang } 69086a27549SJunchao Zhang 691076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 692d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 693d71ae5a4SJacob Faibussowitsch { 694076ba34aSJunchao Zhang Mat_SeqAIJ *a, *b; 695076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok, *bkok, *ckok; 696076ba34aSJunchao Zhang MatScalarKokkosView aa, ba, ca; 697076ba34aSJunchao Zhang MatRowMapKokkosView ai, bi, ci; 698076ba34aSJunchao Zhang MatColIdxKokkosView aj, bj, cj; 699076ba34aSJunchao Zhang PetscInt m, n, nnz, aN; 700a3f881fbSStefano Zampini 701a3f881fbSStefano Zampini PetscFunctionBegin; 702076ba34aSJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 703076ba34aSJunchao Zhang PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 704076ba34aSJunchao Zhang PetscValidPointer(C, 4); 705076ba34aSJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 706076ba34aSJunchao Zhang PetscCheckTypeName(B, MATSEQAIJKOKKOS); 7075f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 7085f80ce2aSJacob Faibussowitsch PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 709076ba34aSJunchao Zhang 7109566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 7119566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 712076ba34aSJunchao Zhang a = static_cast<Mat_SeqAIJ *>(A->data); 713076ba34aSJunchao Zhang b = static_cast<Mat_SeqAIJ *>(B->data); 714076ba34aSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 715076ba34aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 716076ba34aSJunchao Zhang aa = akok->a_dual.view_device(); 717076ba34aSJunchao Zhang ai = akok->i_dual.view_device(); 718076ba34aSJunchao Zhang ba = bkok->a_dual.view_device(); 719076ba34aSJunchao Zhang bi = bkok->i_dual.view_device(); 720076ba34aSJunchao Zhang m = A->rmap->n; /* M, N and nnz of C */ 721076ba34aSJunchao Zhang n = A->cmap->n + B->cmap->n; 722076ba34aSJunchao Zhang nnz = a->nz + b->nz; 723076ba34aSJunchao Zhang aN = A->cmap->n; /* N of A */ 724076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { 725076ba34aSJunchao Zhang aj = akok->j_dual.view_device(); 726076ba34aSJunchao Zhang bj = bkok->j_dual.view_device(); 727076ba34aSJunchao Zhang auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 728076ba34aSJunchao Zhang auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 729076ba34aSJunchao Zhang auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 730076ba34aSJunchao Zhang ca = ca_dual.view_device(); 731076ba34aSJunchao Zhang ci = ci_dual.view_device(); 732076ba34aSJunchao Zhang cj = cj_dual.view_device(); 733076ba34aSJunchao Zhang 734076ba34aSJunchao Zhang /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 7359371c9d4SSatish Balay Kokkos::parallel_for( 7369371c9d4SSatish Balay Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 737076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 738076ba34aSJunchao Zhang PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 739076ba34aSJunchao Zhang 740076ba34aSJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 741076ba34aSJunchao Zhang ci(i) = coffset; 742076ba34aSJunchao Zhang if (i == m - 1) ci(m) = ai(m) + bi(m); 743076ba34aSJunchao Zhang }); 744076ba34aSJunchao Zhang 745076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 746076ba34aSJunchao Zhang if (k < alen) { 747076ba34aSJunchao Zhang ca(coffset + k) = aa(ai(i) + k); 748076ba34aSJunchao Zhang cj(coffset + k) = aj(ai(i) + k); 749076ba34aSJunchao Zhang } else { 750076ba34aSJunchao Zhang ca(coffset + k) = ba(bi(i) + k - alen); 751076ba34aSJunchao Zhang cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 752076ba34aSJunchao Zhang } 753076ba34aSJunchao Zhang }); 754076ba34aSJunchao Zhang }); 755076ba34aSJunchao Zhang ca_dual.modify_device(); 756076ba34aSJunchao Zhang ci_dual.modify_device(); 757076ba34aSJunchao Zhang cj_dual.modify_device(); 7589566063dSJacob Faibussowitsch PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 7599566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 760076ba34aSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { 761076ba34aSJunchao Zhang PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 762076ba34aSJunchao Zhang PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 763076ba34aSJunchao Zhang ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 764076ba34aSJunchao Zhang ca = ckok->a_dual.view_device(); 765076ba34aSJunchao Zhang ci = ckok->i_dual.view_device(); 766076ba34aSJunchao Zhang 7679371c9d4SSatish Balay Kokkos::parallel_for( 7689371c9d4SSatish Balay Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 769076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 770076ba34aSJunchao Zhang PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 771076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 772076ba34aSJunchao Zhang if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 773076ba34aSJunchao Zhang else ca(ci(i) + k) = ba(bi(i) + k - alen); 774076ba34aSJunchao Zhang }); 775076ba34aSJunchao Zhang }); 7769566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 777076ba34aSJunchao Zhang } 7783ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 779076ba34aSJunchao Zhang } 780076ba34aSJunchao Zhang 781d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 782d71ae5a4SJacob Faibussowitsch { 783076ba34aSJunchao Zhang PetscFunctionBegin; 784076ba34aSJunchao Zhang delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 7853ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 786a3f881fbSStefano Zampini } 787a3f881fbSStefano Zampini 788d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 789d71ae5a4SJacob Faibussowitsch { 790a3f881fbSStefano Zampini Mat_Product *product = C->product; 791a3f881fbSStefano Zampini Mat A, B; 792076ba34aSJunchao Zhang bool transA, transB; /* use bool, since KK needs this type */ 793a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok, *bkok, *ckok; 794a3f881fbSStefano Zampini Mat_SeqAIJ *c; 795076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 7960e3ece09SJunchao Zhang KokkosCsrMatrix csrmatA, csrmatB; 797a3f881fbSStefano Zampini 798a3f881fbSStefano Zampini PetscFunctionBegin; 799a3f881fbSStefano Zampini MatCheckProduct(C, 1); 8005f80ce2aSJacob Faibussowitsch PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 801076ba34aSJunchao Zhang pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 802076ba34aSJunchao Zhang 8030e3ece09SJunchao Zhang // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)). 8040e3ece09SJunchao Zhang // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), 8050e3ece09SJunchao Zhang // we still do numeric. 8060e3ece09SJunchao Zhang if (pdata->reusesym) { // numeric reuses results from symbolic 8070e3ece09SJunchao Zhang pdata->reusesym = PETSC_FALSE; 8083ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 809076ba34aSJunchao Zhang } 810076ba34aSJunchao Zhang 811076ba34aSJunchao Zhang switch (product->type) { 8129371c9d4SSatish Balay case MATPRODUCT_AB: 8139371c9d4SSatish Balay transA = false; 8149371c9d4SSatish Balay transB = false; 8159371c9d4SSatish Balay break; 8169371c9d4SSatish Balay case MATPRODUCT_AtB: 8179371c9d4SSatish Balay transA = true; 8189371c9d4SSatish Balay transB = false; 8199371c9d4SSatish Balay break; 8209371c9d4SSatish Balay case MATPRODUCT_ABt: 8219371c9d4SSatish Balay transA = false; 8229371c9d4SSatish Balay transB = true; 8239371c9d4SSatish Balay break; 824d71ae5a4SJacob Faibussowitsch default: 825d71ae5a4SJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 826076ba34aSJunchao Zhang } 827076ba34aSJunchao Zhang 828a3f881fbSStefano Zampini A = product->A; 829a3f881fbSStefano Zampini B = product->B; 8309566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 8319566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 832a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 833a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 834a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 835076ba34aSJunchao Zhang 8365f80ce2aSJacob Faibussowitsch PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 837076ba34aSJunchao Zhang 8380e3ece09SJunchao Zhang csrmatA = akok->csrmat; 8390e3ece09SJunchao Zhang csrmatB = bkok->csrmat; 840076ba34aSJunchao Zhang 841076ba34aSJunchao Zhang /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 842076ba34aSJunchao Zhang if (transA) { 8439566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 844076ba34aSJunchao Zhang transA = false; 845a3f881fbSStefano Zampini } 846a3f881fbSStefano Zampini 847076ba34aSJunchao Zhang if (transB) { 8489566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 849076ba34aSJunchao Zhang transB = false; 850076ba34aSJunchao Zhang } 8519566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 8520e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat)); 8530e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 854866eb059SJunchao Zhang auto spgemmHandle = pdata->kh.get_spgemm_handle(); 855866eb059SJunchao Zhang if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 856e944a159SJunchao Zhang #endif 857866eb059SJunchao Zhang 8589566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 8599566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(C)); 860a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 861a3f881fbSStefano Zampini c = (Mat_SeqAIJ *)C->data; 8629566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 8639566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 8649566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 865a3f881fbSStefano Zampini c->reallocs = 0; 866076ba34aSJunchao Zhang C->info.mallocs = 0; 867a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 868a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 869a3f881fbSStefano Zampini C->num_ass++; 8703ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 871a3f881fbSStefano Zampini } 872a3f881fbSStefano Zampini 873d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 874d71ae5a4SJacob Faibussowitsch { 875076ba34aSJunchao Zhang Mat_Product *product = C->product; 876076ba34aSJunchao Zhang MatProductType ptype; 877076ba34aSJunchao Zhang Mat A, B; 878076ba34aSJunchao Zhang bool transA, transB; 879076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok, *bkok, *ckok; 880076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 881076ba34aSJunchao Zhang MPI_Comm comm; 8820e3ece09SJunchao Zhang KokkosCsrMatrix csrmatA, csrmatB, csrmatC; 883a3f881fbSStefano Zampini 884a3f881fbSStefano Zampini PetscFunctionBegin; 885a3f881fbSStefano Zampini MatCheckProduct(C, 1); 8869566063dSJacob Faibussowitsch PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 8875f80ce2aSJacob Faibussowitsch PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 888a3f881fbSStefano Zampini A = product->A; 889a3f881fbSStefano Zampini B = product->B; 8909566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 8919566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 892a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 893a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 8940e3ece09SJunchao Zhang csrmatA = akok->csrmat; 8950e3ece09SJunchao Zhang csrmatB = bkok->csrmat; 896076ba34aSJunchao Zhang 897a3f881fbSStefano Zampini ptype = product->type; 8980e3ece09SJunchao Zhang // Take advantage of the symmetry if true 8990e3ece09SJunchao Zhang if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) { 9000e3ece09SJunchao Zhang ptype = MATPRODUCT_AB; 9010e3ece09SJunchao Zhang product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE; 9020e3ece09SJunchao Zhang } 9030e3ece09SJunchao Zhang if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) { 9040e3ece09SJunchao Zhang ptype = MATPRODUCT_AB; 9050e3ece09SJunchao Zhang product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE; 9060e3ece09SJunchao Zhang } 9070e3ece09SJunchao Zhang 908a3f881fbSStefano Zampini switch (ptype) { 9099371c9d4SSatish Balay case MATPRODUCT_AB: 9109371c9d4SSatish Balay transA = false; 9119371c9d4SSatish Balay transB = false; 9129371c9d4SSatish Balay break; 9139371c9d4SSatish Balay case MATPRODUCT_AtB: 9149371c9d4SSatish Balay transA = true; 9159371c9d4SSatish Balay transB = false; 9169371c9d4SSatish Balay break; 9179371c9d4SSatish Balay case MATPRODUCT_ABt: 9189371c9d4SSatish Balay transA = false; 9199371c9d4SSatish Balay transB = true; 9209371c9d4SSatish Balay break; 921d71ae5a4SJacob Faibussowitsch default: 922d71ae5a4SJacob Faibussowitsch SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 923a3f881fbSStefano Zampini } 9240e3ece09SJunchao Zhang PetscCallCXX(product->data = pdata = new MatProductData_SeqAIJKokkos()); 925076ba34aSJunchao Zhang pdata->reusesym = product->api_user; 926a3f881fbSStefano Zampini 927076ba34aSJunchao Zhang /* TODO: add command line options to select spgemm algorithms */ 928866eb059SJunchao Zhang auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 929866eb059SJunchao Zhang 930866eb059SJunchao Zhang /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 931866eb059SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 932866eb059SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 933866eb059SJunchao Zhang spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 934866eb059SJunchao Zhang #endif 935866eb059SJunchao Zhang #endif 9360e3ece09SJunchao Zhang PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg)); 937076ba34aSJunchao Zhang 9389566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 939076ba34aSJunchao Zhang /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 940076ba34aSJunchao Zhang if (transA) { 9419566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 942076ba34aSJunchao Zhang transA = false; 943076ba34aSJunchao Zhang } 944076ba34aSJunchao Zhang 945076ba34aSJunchao Zhang if (transB) { 9469566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 947076ba34aSJunchao Zhang transB = false; 948076ba34aSJunchao Zhang } 949076ba34aSJunchao Zhang 9500e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 951076ba34aSJunchao Zhang /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 952076ba34aSJunchao Zhang So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 953076ba34aSJunchao Zhang calling new Mat_SeqAIJKokkos(). 954076ba34aSJunchao Zhang TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 955076ba34aSJunchao Zhang */ 9560e3ece09SJunchao Zhang PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC)); 9570e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0) 958866eb059SJunchao Zhang /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 959866eb059SJunchao Zhang auto spgemmHandle = pdata->kh.get_spgemm_handle(); 960866eb059SJunchao Zhang if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 961e944a159SJunchao Zhang #endif 9629566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 963076ba34aSJunchao Zhang 9649566063dSJacob Faibussowitsch PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 9659566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 966076ba34aSJunchao Zhang C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 9673ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 968a3f881fbSStefano Zampini } 969a3f881fbSStefano Zampini 970a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 971d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 972d71ae5a4SJacob Faibussowitsch { 973076ba34aSJunchao Zhang Mat_Product *product = mat->product; 974a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 975a3f881fbSStefano Zampini 976a3f881fbSStefano Zampini PetscFunctionBegin; 977a3f881fbSStefano Zampini MatCheckProduct(mat, 1); 9789566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 97948a46eb9SPierre Jolivet if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 980a3f881fbSStefano Zampini if (Biskok && Ciskok) { 981a3f881fbSStefano Zampini switch (product->type) { 982a3f881fbSStefano Zampini case MATPRODUCT_AB: 983a3f881fbSStefano Zampini case MATPRODUCT_AtB: 984d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABt: 985d71ae5a4SJacob Faibussowitsch mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 986d71ae5a4SJacob Faibussowitsch break; 987a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 988a3f881fbSStefano Zampini case MATPRODUCT_RARt: 989d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABC: 990d71ae5a4SJacob Faibussowitsch mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 991d71ae5a4SJacob Faibussowitsch break; 992d71ae5a4SJacob Faibussowitsch default: 993d71ae5a4SJacob Faibussowitsch break; 994a3f881fbSStefano Zampini } 995a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 9969566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 997a3f881fbSStefano Zampini } 9983ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 999a3f881fbSStefano Zampini } 1000a587d139SMark 1001d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 1002d71ae5a4SJacob Faibussowitsch { 1003f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 1004f0cf5187SStefano Zampini 1005f0cf5187SStefano Zampini PetscFunctionBegin; 10069566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 10079566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1008f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1009076ba34aSJunchao Zhang KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 10109566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 10119566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 10129566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 10133ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1014f0cf5187SStefano Zampini } 1015f0cf5187SStefano Zampini 1016d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 1017d71ae5a4SJacob Faibussowitsch { 1018076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1019a587d139SMark 1020a587d139SMark PetscFunctionBegin; 1021076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 10222328674fSJunchao Zhang if (aijkok) { /* Only zero the device if data is already there */ 1023076ba34aSJunchao Zhang KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0); 10249566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 10252328674fSJunchao Zhang } else { /* Might be preallocated but not assembled */ 10269566063dSJacob Faibussowitsch PetscCall(MatZeroEntries_SeqAIJ(A)); 10272328674fSJunchao Zhang } 10283ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1029a587d139SMark } 1030a587d139SMark 1031d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 1032d71ae5a4SJacob Faibussowitsch { 1033f78ce678SMark Adams Mat_SeqAIJ *aijseq; 1034f78ce678SMark Adams Mat_SeqAIJKokkos *aijkok; 1035f78ce678SMark Adams PetscInt n; 1036f78ce678SMark Adams PetscScalarKokkosView xv; 1037f78ce678SMark Adams 1038f78ce678SMark Adams PetscFunctionBegin; 1039f78ce678SMark Adams PetscCall(VecGetLocalSize(x, &n)); 1040f78ce678SMark Adams PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 1041f78ce678SMark Adams PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 1042f78ce678SMark Adams 1043f78ce678SMark Adams PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1044f78ce678SMark Adams aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1045f78ce678SMark Adams 1046f78ce678SMark Adams if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */ 1047f78ce678SMark Adams PetscCall(MatMarkDiagonal_SeqAIJ(A)); 1048f78ce678SMark Adams aijseq = static_cast<Mat_SeqAIJ *>(A->data); 1049f78ce678SMark Adams aijkok->SetDiagonal(aijseq->diag); 1050f78ce678SMark Adams } 1051f78ce678SMark Adams 1052f78ce678SMark Adams const auto &Aa = aijkok->a_dual.view_device(); 1053f78ce678SMark Adams const auto &Ai = aijkok->i_dual.view_device(); 1054f78ce678SMark Adams const auto &Adiag = aijkok->diag_dual.view_device(); 1055f78ce678SMark Adams 1056f78ce678SMark Adams PetscCall(VecGetKokkosViewWrite(x, &xv)); 10579371c9d4SSatish Balay Kokkos::parallel_for( 10589371c9d4SSatish Balay n, KOKKOS_LAMBDA(const PetscInt i) { 1059f78ce678SMark Adams if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 1060f78ce678SMark Adams else xv(i) = 0; 1061f78ce678SMark Adams }); 1062f78ce678SMark Adams PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 10633ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1064f78ce678SMark Adams } 1065f78ce678SMark Adams 1066db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 1067d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1068d71ae5a4SJacob Faibussowitsch { 1069db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1070db78de30SJunchao Zhang 1071db78de30SJunchao Zhang PetscFunctionBegin; 1072db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1073db78de30SJunchao Zhang PetscValidPointer(kv, 2); 1074db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 10759566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1076db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1077076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 10783ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1079db78de30SJunchao Zhang } 1080db78de30SJunchao Zhang 1081d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 1082d71ae5a4SJacob Faibussowitsch { 1083db78de30SJunchao Zhang PetscFunctionBegin; 1084db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1085db78de30SJunchao Zhang PetscValidPointer(kv, 2); 1086db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 10873ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1088db78de30SJunchao Zhang } 1089db78de30SJunchao Zhang 1090d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 1091d71ae5a4SJacob Faibussowitsch { 1092db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1093db78de30SJunchao Zhang 1094db78de30SJunchao Zhang PetscFunctionBegin; 1095db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1096db78de30SJunchao Zhang PetscValidPointer(kv, 2); 1097db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 10989566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1099db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1100076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 11013ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1102db78de30SJunchao Zhang } 1103db78de30SJunchao Zhang 1104d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1105d71ae5a4SJacob Faibussowitsch { 1106db78de30SJunchao Zhang PetscFunctionBegin; 1107db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1108db78de30SJunchao Zhang PetscValidPointer(kv, 2); 1109db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 11109566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 11113ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1112db78de30SJunchao Zhang } 1113db78de30SJunchao Zhang 1114d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1115d71ae5a4SJacob Faibussowitsch { 1116db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1117db78de30SJunchao Zhang 1118db78de30SJunchao Zhang PetscFunctionBegin; 1119db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1120db78de30SJunchao Zhang PetscValidPointer(kv, 2); 1121db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1122db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1123076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 11243ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1125db78de30SJunchao Zhang } 1126db78de30SJunchao Zhang 1127d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1128d71ae5a4SJacob Faibussowitsch { 1129db78de30SJunchao Zhang PetscFunctionBegin; 1130db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1131db78de30SJunchao Zhang PetscValidPointer(kv, 2); 1132db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 11339566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 11343ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1135db78de30SJunchao Zhang } 1136db78de30SJunchao Zhang 1137c17cf699SJunchao Zhang /* Computes Y += alpha X */ 1138d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1139d71ae5a4SJacob Faibussowitsch { 1140a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1141c17cf699SJunchao Zhang Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1142c17cf699SJunchao Zhang ConstMatScalarKokkosView Xa; 1143c17cf699SJunchao Zhang MatScalarKokkosView Ya; 1144a587d139SMark 1145a587d139SMark PetscFunctionBegin; 1146c17cf699SJunchao Zhang PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1147c17cf699SJunchao Zhang PetscCheckTypeName(X, MATSEQAIJKOKKOS); 11489566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 11499566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(X)); 11509566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1151db78de30SJunchao Zhang 1152c17cf699SJunchao Zhang if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1153a587d139SMark PetscBool e; 11549566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1155a587d139SMark if (e) { 11569566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1157c17cf699SJunchao Zhang if (e) pattern = SAME_NONZERO_PATTERN; 1158a587d139SMark } 1159a587d139SMark } 1160db78de30SJunchao Zhang 1161c17cf699SJunchao Zhang /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1162c17cf699SJunchao Zhang cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1163c17cf699SJunchao Zhang If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1164c17cf699SJunchao Zhang KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1165c17cf699SJunchao Zhang */ 1166c17cf699SJunchao Zhang ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1167c17cf699SJunchao Zhang xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1168c17cf699SJunchao Zhang Xa = xkok->a_dual.view_device(); 1169c17cf699SJunchao Zhang Ya = ykok->a_dual.view_device(); 1170c17cf699SJunchao Zhang 1171c17cf699SJunchao Zhang if (pattern == SAME_NONZERO_PATTERN) { 1172c17cf699SJunchao Zhang KokkosBlas::axpy(alpha, Xa, Ya); 11739566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1174c17cf699SJunchao Zhang } else if (pattern == SUBSET_NONZERO_PATTERN) { 1175c17cf699SJunchao Zhang MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1176c17cf699SJunchao Zhang MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1177c17cf699SJunchao Zhang 11789371c9d4SSatish Balay Kokkos::parallel_for( 11799371c9d4SSatish Balay Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 11800e3ece09SJunchao Zhang PetscInt i = t.league_rank(); // row i 11810e3ece09SJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { 11820e3ece09SJunchao Zhang // Only one thread works in a team 1183c17cf699SJunchao Zhang PetscInt p, q = Yi(i); 11840e3ece09SJunchao Zhang for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X, 11850e3ece09SJunchao Zhang while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y. 11860e3ece09SJunchao Zhang if (Xj(p) == Yj(q)) { // Found it 1187c17cf699SJunchao Zhang Ya(q) += alpha * Xa(p); 1188c17cf699SJunchao Zhang q++; 1189a587d139SMark } else { 11900e3ece09SJunchao Zhang // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 11910e3ece09SJunchao Zhang // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 11920e3ece09SJunchao Zhang #if PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0) 11930e3ece09SJunchao Zhang if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan(); 11948b8b16f9SJunchao Zhang #else 11950e3ece09SJunchao Zhang if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); 11968b8b16f9SJunchao Zhang #endif 1197a587d139SMark } 1198c17cf699SJunchao Zhang } 1199c17cf699SJunchao Zhang }); 1200c17cf699SJunchao Zhang }); 12019566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 12020e3ece09SJunchao Zhang } else { // different nonzero patterns 1203c17cf699SJunchao Zhang Mat Z; 1204c17cf699SJunchao Zhang KokkosCsrMatrix zcsr; 1205c17cf699SJunchao Zhang KernelHandle kh; 12060e3ece09SJunchao Zhang kh.create_spadd_handle(true); // X, Y are sorted 1207c17cf699SJunchao Zhang KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1208c17cf699SJunchao Zhang KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1209c17cf699SJunchao Zhang zkok = new Mat_SeqAIJKokkos(zcsr); 12109566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 12119566063dSJacob Faibussowitsch PetscCall(MatHeaderReplace(Y, &Z)); 1212c17cf699SJunchao Zhang kh.destroy_spadd_handle(); 1213c17cf699SJunchao Zhang } 12149566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 12150e3ece09SJunchao Zhang PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y 12163ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1217a587d139SMark } 1218a587d139SMark 1219d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1220d71ae5a4SJacob Faibussowitsch { 122142550becSJunchao Zhang Mat_SeqAIJKokkos *akok; 122242550becSJunchao Zhang Mat_SeqAIJ *aseq; 122342550becSJunchao Zhang 122442550becSJunchao Zhang PetscFunctionBegin; 12259566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1226394ed5ebSJunchao Zhang aseq = static_cast<Mat_SeqAIJ *>(mat->data); 122742550becSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1228cbc6b225SStefano Zampini delete akok; 1229cbc6b225SStefano Zampini mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE); 12309566063dSJacob Faibussowitsch PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1231394ed5ebSJunchao Zhang akok->SetUpCOO(aseq); 12323ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 123342550becSJunchao Zhang } 123442550becSJunchao Zhang 1235d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1236d71ae5a4SJacob Faibussowitsch { 123742550becSJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 123842550becSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1239394ed5ebSJunchao Zhang PetscCount Annz = aseq->nz; 1240394ed5ebSJunchao Zhang const PetscCountKokkosView &jmap = akok->jmap_d; 1241394ed5ebSJunchao Zhang const PetscCountKokkosView &perm = akok->perm_d; 124242550becSJunchao Zhang MatScalarKokkosView Aa; 124342550becSJunchao Zhang ConstMatScalarKokkosView kv; 124442550becSJunchao Zhang PetscMemType memtype; 124542550becSJunchao Zhang 124642550becSJunchao Zhang PetscFunctionBegin; 12479566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(v, &memtype)); 124842550becSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1249394ed5ebSJunchao Zhang kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n)); 125042550becSJunchao Zhang } else { 1251394ed5ebSJunchao Zhang kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */ 125242550becSJunchao Zhang } 125342550becSJunchao Zhang 1254c7b718f4SJunchao Zhang if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1255c7b718f4SJunchao Zhang else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 125642550becSJunchao Zhang 12579371c9d4SSatish Balay Kokkos::parallel_for( 12589371c9d4SSatish Balay Annz, KOKKOS_LAMBDA(const PetscCount i) { 1259c7b718f4SJunchao Zhang PetscScalar sum = 0.0; 1260c7b718f4SJunchao Zhang for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1261c7b718f4SJunchao Zhang Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1262c7b718f4SJunchao Zhang }); 1263394ed5ebSJunchao Zhang 12649566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 12659566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 12663ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 126742550becSJunchao Zhang } 126842550becSJunchao Zhang 1269d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag) 1270d71ae5a4SJacob Faibussowitsch { 12715fbaff96SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 12725fbaff96SJunchao Zhang MatScalarKokkosView Aa; 12735fbaff96SJunchao Zhang const MatRowMapKokkosView &Ai = akok->i_dual.view_device(); 12745fbaff96SJunchao Zhang PetscInt m = A->rmap->n; 12755fbaff96SJunchao Zhang ConstMatRowMapKokkosView Adiag(diag, m); /* diag is a device pointer */ 12765fbaff96SJunchao Zhang 12775fbaff96SJunchao Zhang PetscFunctionBegin; 12785fbaff96SJunchao Zhang PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); 12799371c9d4SSatish Balay Kokkos::parallel_for( 12809371c9d4SSatish Balay m, KOKKOS_LAMBDA(const PetscInt i) { 12815fbaff96SJunchao Zhang PetscScalar tmp; 12825fbaff96SJunchao Zhang if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */ 12835fbaff96SJunchao Zhang tmp = Aa(Ai(i)); 12845fbaff96SJunchao Zhang Aa(Ai(i)) = Aa(Adiag(i)); 12855fbaff96SJunchao Zhang Aa(Adiag(i)) = tmp; 12865fbaff96SJunchao Zhang } 12875fbaff96SJunchao Zhang }); 12885fbaff96SJunchao Zhang PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 12893ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 12905fbaff96SJunchao Zhang } 12915fbaff96SJunchao Zhang 1292d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1293d71ae5a4SJacob Faibussowitsch { 12948f7e8f9dSMark Adams PetscFunctionBegin; 12959566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncHost(A)); 12969566063dSJacob Faibussowitsch PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 12978f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 12983ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 12998f7e8f9dSMark Adams } 13008f7e8f9dSMark Adams 1301d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1302d71ae5a4SJacob Faibussowitsch { 1303076ba34aSJunchao Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1304076ba34aSJunchao Zhang 13058c3ff71bSJunchao Zhang PetscFunctionBegin; 1306076ba34aSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 13076f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 13086f3d89d0SStefano Zampini 13098c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 13108c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 13118c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1312a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 1313f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 1314a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1315076ba34aSJunchao Zhang A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 13168c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 13178c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 13188c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 13198c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 13208c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 13218c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1322076ba34aSJunchao Zhang A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 13230ecb592aSJunchao Zhang A->ops->transpose = MatTranspose_SeqAIJKokkos; 1324152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 1325f78ce678SMark Adams A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1326076ba34aSJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1327076ba34aSJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1328076ba34aSJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1329076ba34aSJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1330076ba34aSJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1331076ba34aSJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 13327ee59b9bSJunchao Zhang a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 133342550becSJunchao Zhang 13349566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 13359566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 13363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1337076ba34aSJunchao Zhang } 1338076ba34aSJunchao Zhang 1339d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1340d71ae5a4SJacob Faibussowitsch { 1341076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 1342076ba34aSJunchao Zhang PetscInt i, m, n; 1343076ba34aSJunchao Zhang 1344076ba34aSJunchao Zhang PetscFunctionBegin; 13455f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1346076ba34aSJunchao Zhang 1347076ba34aSJunchao Zhang m = akok->nrows(); 1348076ba34aSJunchao Zhang n = akok->ncols(); 13499566063dSJacob Faibussowitsch PetscCall(MatSetSizes(A, m, n, m, n)); 13509566063dSJacob Faibussowitsch PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1351076ba34aSJunchao Zhang 1352076ba34aSJunchao Zhang /* Set up data structures of A as a MATSEQAIJ */ 13539566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1354076ba34aSJunchao Zhang aseq = (Mat_SeqAIJ *)(A)->data; 1355076ba34aSJunchao Zhang 1356076ba34aSJunchao Zhang akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1357076ba34aSJunchao Zhang akok->j_dual.sync_host(); 1358076ba34aSJunchao Zhang 1359076ba34aSJunchao Zhang aseq->i = akok->i_host_data(); 1360076ba34aSJunchao Zhang aseq->j = akok->j_host_data(); 1361076ba34aSJunchao Zhang aseq->a = akok->a_host_data(); 1362076ba34aSJunchao Zhang aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1363076ba34aSJunchao Zhang aseq->singlemalloc = PETSC_FALSE; 1364076ba34aSJunchao Zhang aseq->free_a = PETSC_FALSE; 1365076ba34aSJunchao Zhang aseq->free_ij = PETSC_FALSE; 1366076ba34aSJunchao Zhang aseq->nz = akok->nnz(); 1367076ba34aSJunchao Zhang aseq->maxnz = aseq->nz; 1368076ba34aSJunchao Zhang 13699566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aseq->imax)); 13709566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aseq->ilen)); 1371ad540459SPierre Jolivet for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1372076ba34aSJunchao Zhang 1373076ba34aSJunchao Zhang /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1374076ba34aSJunchao Zhang akok->nonzerostate = A->nonzerostate; 1375ff751488SJunchao Zhang A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 13769566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 13779566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 13783ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1379076ba34aSJunchao Zhang } 1380076ba34aSJunchao Zhang 13810e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr) 13820e3ece09SJunchao Zhang { 13830e3ece09SJunchao Zhang PetscFunctionBegin; 13840e3ece09SJunchao Zhang PetscCall(MatSeqAIJKokkosSyncDevice(A)); 13850e3ece09SJunchao Zhang *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat; 13860e3ece09SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 13870e3ece09SJunchao Zhang } 13880e3ece09SJunchao Zhang 13890e3ece09SJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A) 13900e3ece09SJunchao Zhang { 13910e3ece09SJunchao Zhang Mat_SeqAIJKokkos *akok; 13920e3ece09SJunchao Zhang PetscFunctionBegin; 13930e3ece09SJunchao Zhang PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr)); 13940e3ece09SJunchao Zhang PetscCall(MatCreate(comm, A)); 13950e3ece09SJunchao Zhang PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 13960e3ece09SJunchao Zhang PetscFunctionReturn(PETSC_SUCCESS); 13970e3ece09SJunchao Zhang } 13980e3ece09SJunchao Zhang 1399076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1400076ba34aSJunchao Zhang 1401076ba34aSJunchao Zhang Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1402076ba34aSJunchao Zhang */ 1403d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1404d71ae5a4SJacob Faibussowitsch { 1405076ba34aSJunchao Zhang PetscFunctionBegin; 14069566063dSJacob Faibussowitsch PetscCall(MatCreate(comm, A)); 14079566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 14083ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 14098c3ff71bSJunchao Zhang } 14108c3ff71bSJunchao Zhang 1411152b3e56SJunchao Zhang /*@C 141211a5261eSBarry Smith MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 14138c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 141420f4b53cSBarry Smith Kokkos for calculations. 14158c3ff71bSJunchao Zhang 14168c3ff71bSJunchao Zhang Collective 14178c3ff71bSJunchao Zhang 14188c3ff71bSJunchao Zhang Input Parameters: 141911a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF` 14208c3ff71bSJunchao Zhang . m - number of rows 14218c3ff71bSJunchao Zhang . n - number of columns 142220f4b53cSBarry Smith . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided 142320f4b53cSBarry Smith - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL` 14248c3ff71bSJunchao Zhang 14258c3ff71bSJunchao Zhang Output Parameter: 14268c3ff71bSJunchao Zhang . A - the matrix 14278c3ff71bSJunchao Zhang 14282ef1f0ffSBarry Smith Level: intermediate 14292ef1f0ffSBarry Smith 14302ef1f0ffSBarry Smith Notes: 143111a5261eSBarry Smith It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 14328c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 143311a5261eSBarry Smith [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 14348c3ff71bSJunchao Zhang 143511a5261eSBarry Smith The AIJ format, also called 14362ef1f0ffSBarry Smith compressed row storage, is fully compatible with standard Fortran 14378c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 143820f4b53cSBarry Smith either one (as in Fortran) or zero. 14398c3ff71bSJunchao Zhang 14402ef1f0ffSBarry Smith Specify the preallocated storage with either `nz` or `nnz` (not both). 14412ef1f0ffSBarry Smith Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 14422ef1f0ffSBarry Smith allocation. 14438c3ff71bSJunchao Zhang 1444*1cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()` 14458c3ff71bSJunchao Zhang @*/ 1446d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1447d71ae5a4SJacob Faibussowitsch { 14488c3ff71bSJunchao Zhang PetscFunctionBegin; 14499566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 14509566063dSJacob Faibussowitsch PetscCall(MatCreate(comm, A)); 14519566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A, m, n, m, n)); 14529566063dSJacob Faibussowitsch PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 14539566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 14543ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 14558c3ff71bSJunchao Zhang } 1456930e68a5SMark Adams 14578f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member; 14588f7e8f9dSMark Adams // 145946804e07SMark Adams // This factorization exploits block diagonal matrices with "Nf" (not used). 14608f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 14618f7e8f9dSMark Adams // 1462d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B, Mat A, const MatFactorInfo *info) 1463d71ae5a4SJacob Faibussowitsch { 14648f7e8f9dSMark Adams Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 14658f7e8f9dSMark Adams Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 14668f7e8f9dSMark Adams IS isrow = b->row, isicol = b->icol; 14678f7e8f9dSMark Adams const PetscInt *r_h, *ic_h; 1468300d22a6SJunchao Zhang const PetscInt n = A->rmap->n, *ai_d = aijkok->i_dual.view_device().data(), *aj_d = aijkok->j_dual.view_device().data(), *bi_d = baijkok->i_dual.view_device().data(), *bj_d = baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1469076ba34aSJunchao Zhang const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1470076ba34aSJunchao Zhang PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 14718f7e8f9dSMark Adams PetscBool row_identity, col_identity; 147246804e07SMark Adams PetscInt nc, Nf = 1, nVec = 32; // should be a parameter, Nf is batch size - not used 1473930e68a5SMark Adams 1474930e68a5SMark Adams PetscFunctionBegin; 14752c71b3e2SJacob Faibussowitsch PetscCheck(A->rmap->n == n, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT, A->rmap->n, n); 1476b94d7dedSBarry Smith PetscCall(MatIsStructurallySymmetric(A, &row_identity)); 14772c71b3e2SJacob Faibussowitsch PetscCheck(row_identity, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "structurally symmetric matrices only supported"); 14789566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &r_h)); 14799566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isicol, &ic_h)); 14809566063dSJacob Faibussowitsch PetscCall(ISGetSize(isicol, &nc)); 14819566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 14829566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 14838f7e8f9dSMark Adams { 14848f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1 14858f7e8f9dSMark Adams using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 14868f7e8f9dSMark Adams using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 14878f7e8f9dSMark Adams using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 14888f7e8f9dSMark Adams const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_r_k(r_h, n); 14898f7e8f9dSMark Adams Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_r_k("r", n); 14908f7e8f9dSMark Adams const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_ic_k(ic_h, nc); 14918f7e8f9dSMark Adams Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_ic_k("ic", nc); 14928f7e8f9dSMark Adams size_t flops_h = 0.0; 14938f7e8f9dSMark Adams Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k(&flops_h); 14948f7e8f9dSMark Adams Kokkos::View<size_t> d_flops_k("flops"); 14958f7e8f9dSMark Adams const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1496da81f932SPierre Jolivet const int nloc = n / Nf, Ni = (conc > 8) ? 1 /* some intelligent number of SMs -- but need league_barrier */ : 1; 14978f7e8f9dSMark Adams Kokkos::deep_copy(d_flops_k, h_flops_k); 14988f7e8f9dSMark Adams Kokkos::deep_copy(d_r_k, h_r_k); 14998f7e8f9dSMark Adams Kokkos::deep_copy(d_ic_k, h_ic_k); 15008f7e8f9dSMark Adams // Fill A --> fact 15019371c9d4SSatish Balay Kokkos::parallel_for( 15029371c9d4SSatish Balay Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec), KOKKOS_LAMBDA(const team_member team) { 1503042217e8SBarry Smith const PetscInt field = team.league_rank() / Ni, field_block = team.league_rank() % Ni; // use grid.x/y in CUDA 15048f7e8f9dSMark Adams const PetscInt nloc_i = (nloc / Ni + !!(nloc % Ni)), start_i = field * nloc + field_block * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i); 15058f7e8f9dSMark Adams const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 15068f7e8f9dSMark Adams // zero rows of B 15078f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) { 15088f7e8f9dSMark Adams PetscInt nzbL = bi_d[rowb + 1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb + 1]; // with diag 15098f7e8f9dSMark Adams PetscScalar *baL = ba_d + bi_d[rowb]; 15108f7e8f9dSMark Adams PetscScalar *baU = ba_d + bdiag_d[rowb + 1] + 1; 15118f7e8f9dSMark Adams /* zero (unfactored row) */ 15128f7e8f9dSMark Adams for (int j = 0; j < nzbL; j++) baL[j] = 0; 15138f7e8f9dSMark Adams for (int j = 0; j < nzbU; j++) baU[j] = 0; 15148f7e8f9dSMark Adams }); 15158f7e8f9dSMark Adams // copy A into B 15168f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) { 15178f7e8f9dSMark Adams PetscInt rowa = r[rowb], nza = ai_d[rowa + 1] - ai_d[rowa]; 15188f7e8f9dSMark Adams const PetscScalar *av = aa_d + ai_d[rowa]; 15198f7e8f9dSMark Adams const PetscInt *ajtmp = aj_d + ai_d[rowa]; 15208f7e8f9dSMark Adams /* load in initial (unfactored row) */ 15218f7e8f9dSMark Adams for (int j = 0; j < nza; j++) { 15228f7e8f9dSMark Adams PetscInt colb = ic[ajtmp[j]]; 15238f7e8f9dSMark Adams PetscScalar vala = av[j]; 15248f7e8f9dSMark Adams if (colb == rowb) { 15258f7e8f9dSMark Adams *(ba_d + bdiag_d[rowb]) = vala; 15268f7e8f9dSMark Adams } else { 15278f7e8f9dSMark Adams const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]); 15288f7e8f9dSMark Adams PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]); 15298f7e8f9dSMark Adams PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb + 1] + 1) : bi_d[rowb + 1] - bi_d[rowb], set = 0; 15308f7e8f9dSMark Adams for (int j = 0; j < nz; j++) { 15318f7e8f9dSMark Adams if (pbj[j] == colb) { 15328f7e8f9dSMark Adams pba[j] = vala; 15338f7e8f9dSMark Adams set++; 15348f7e8f9dSMark Adams break; 15358f7e8f9dSMark Adams } 15368f7e8f9dSMark Adams } 15378f1da0b2SJunchao Zhang #if !defined(PETSC_HAVE_SYCL) 15388f7e8f9dSMark Adams if (set != 1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 153958d687f0SJunchao Zhang #else 154058d687f0SJunchao Zhang (void)set; 15418f1da0b2SJunchao Zhang #endif 15428f7e8f9dSMark Adams } 15438f7e8f9dSMark Adams } 15448f7e8f9dSMark Adams }); 15458f7e8f9dSMark Adams }); 15468f7e8f9dSMark Adams Kokkos::fence(); 1547930e68a5SMark Adams 15489371c9d4SSatish Balay Kokkos::parallel_for( 15499371c9d4SSatish Balay Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size() + scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA(const team_member team) { 15508f7e8f9dSMark Adams sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 15518f7e8f9dSMark Adams scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 15528f7e8f9dSMark Adams sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1553042217e8SBarry Smith const PetscInt field = team.league_rank() / Ni, field_block_idx = team.league_rank() % Ni; // use grid.x/y in CUDA 15548f7e8f9dSMark Adams const PetscInt start = field * nloc, end = start + nloc; 15558f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 15568f7e8f9dSMark Adams // A22 panel update for each row A(1,:) and col A(:,1) 15578f7e8f9dSMark Adams for (int ii = start; ii < end - 1; ii++) { 15588f7e8f9dSMark Adams const PetscInt *bjUi = bj_d + bdiag_d[ii + 1] + 1, nzUi = bdiag_d[ii] - (bdiag_d[ii + 1] + 1); // vector, and vector size, of column indices of U(i,(i+1):end) 15598f7e8f9dSMark Adams const PetscScalar *baUi = ba_d + bdiag_d[ii + 1] + 1; // vector of data U(i,i+1:end) 15608f7e8f9dSMark Adams const PetscInt nUi_its = nzUi / Ni + !!(nzUi % Ni); 15618f7e8f9dSMark Adams const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 15628f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=](const int j) { 15638f7e8f9dSMark Adams PetscInt kIdx = j * Ni + field_block_idx; 15649371c9d4SSatish Balay if (kIdx >= nzUi) /* void */ 15659371c9d4SSatish Balay ; 15668f7e8f9dSMark Adams else { 15678f7e8f9dSMark Adams const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 15688f7e8f9dSMark Adams const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 15698f7e8f9dSMark Adams const PetscInt nzL = bi_d[myk + 1] - bi_d[myk]; // size of L_k(:) 15708f7e8f9dSMark Adams size_t st_idx; 15718f7e8f9dSMark Adams // find and do L(k,i) = A(:k,i) / A(i,i) 15728f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 15738f7e8f9dSMark Adams // get column, there has got to be a better way 15749371c9d4SSatish Balay Kokkos::parallel_reduce( 15759371c9d4SSatish Balay Kokkos::ThreadVectorRange(team, nzL), 15769371c9d4SSatish Balay [&](const int &j, size_t &idx) { 15778f7e8f9dSMark Adams if (pjL[j] == ii) { 15788f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + j; 15798f7e8f9dSMark Adams idx = j; // output 15808f7e8f9dSMark Adams *pLki = *pLki / Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 15818f7e8f9dSMark Adams } 15829371c9d4SSatish Balay }, 15839371c9d4SSatish Balay st_idx); 15849371c9d4SSatish Balay Kokkos::single(Kokkos::PerThread(team), [=]() { 15859371c9d4SSatish Balay colkIdx() = st_idx; 15869371c9d4SSatish Balay L_ki() = *(ba_d + bi_d[myk] + st_idx); 15879371c9d4SSatish Balay }); 15888f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 158999551766SMark Adams if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n", (int)myk, ii); // uses a register 159099551766SMark Adams #endif 159199551766SMark Adams // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 15928f7e8f9dSMark Adams // U(i+1,:end) 15938f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, nzUi), [=](const int &uiIdx) { // index into i (U) 15948f7e8f9dSMark Adams PetscScalar Uij = baUi[uiIdx]; 15958f7e8f9dSMark Adams PetscInt col = bjUi[uiIdx]; 15968f7e8f9dSMark Adams if (col == myk) { 15978f7e8f9dSMark Adams // A_kk = A_kk - L_ki * U_ij(k) 15988f7e8f9dSMark Adams PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 15998f7e8f9dSMark Adams *Akkv = *Akkv - L_ki() * Uij; // UiK 16008f7e8f9dSMark Adams } else { 16018f7e8f9dSMark Adams PetscScalar *start, *end, *pAkjv = NULL; 16028f7e8f9dSMark Adams PetscInt high, low; 16038f7e8f9dSMark Adams const PetscInt *startj; 16048f7e8f9dSMark Adams if (col < myk) { // L 16058f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 16068f7e8f9dSMark Adams PetscInt idx = (pLki + 1) - (ba_d + bi_d[myk]); // index into row 16078f7e8f9dSMark Adams start = pLki + 1; // start at pLki+1, A22(myk,1) 16088f7e8f9dSMark Adams startj = bj_d + bi_d[myk] + idx; 16098f7e8f9dSMark Adams end = ba_d + bi_d[myk + 1]; 16108f7e8f9dSMark Adams } else { 16118f7e8f9dSMark Adams PetscInt idx = bdiag_d[myk + 1] + 1; 16128f7e8f9dSMark Adams start = ba_d + idx; 16138f7e8f9dSMark Adams startj = bj_d + idx; 16148f7e8f9dSMark Adams end = ba_d + bdiag_d[myk]; 16158f7e8f9dSMark Adams } 16168f7e8f9dSMark Adams // search for 'col', use bisection search - TODO 16178f7e8f9dSMark Adams low = 0; 16188f7e8f9dSMark Adams high = (PetscInt)(end - start); 16198f7e8f9dSMark Adams while (high - low > 5) { 16208f7e8f9dSMark Adams int t = (low + high) / 2; 16218f7e8f9dSMark Adams if (startj[t] > col) high = t; 16228f7e8f9dSMark Adams else low = t; 16238f7e8f9dSMark Adams } 16248f7e8f9dSMark Adams for (pAkjv = start + low; pAkjv < start + high; pAkjv++) { 16258f7e8f9dSMark Adams if (startj[pAkjv - start] == col) break; 16268f7e8f9dSMark Adams } 16278f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 162899551766SMark Adams if (pAkjv == start + high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n", (int)myk, (int)col); // uses a register 162999551766SMark Adams #endif 16308f7e8f9dSMark Adams *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 16318f7e8f9dSMark Adams } 16328f7e8f9dSMark Adams }); 16338f7e8f9dSMark Adams } 16348f7e8f9dSMark Adams }); 16358f7e8f9dSMark Adams team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 16368f7e8f9dSMark Adams if (field_block_idx == 0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add(flops.data(), (size_t)(2 * (nzUi * nzUi) + 2)); }); 16378f7e8f9dSMark Adams } /* endof for (i=0; i<n; i++) { */ 16389371c9d4SSatish Balay Kokkos::single(Kokkos::PerTeam(team), [=]() { 16399371c9d4SSatish Balay Kokkos::atomic_add(&d_flops_k(), flops()); 16409371c9d4SSatish Balay flops() = 0; 16419371c9d4SSatish Balay }); 16428f7e8f9dSMark Adams }); 16438f7e8f9dSMark Adams Kokkos::fence(); 16448f7e8f9dSMark Adams Kokkos::deep_copy(h_flops_k, d_flops_k); 16459566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k())); 16469371c9d4SSatish Balay Kokkos::parallel_for( 16479371c9d4SSatish Balay Kokkos::TeamPolicy<>(Nf * Ni, 1, 256), KOKKOS_LAMBDA(const team_member team) { 16488f7e8f9dSMark Adams const PetscInt lg_rank = team.league_rank(), field = lg_rank / Ni; //, field_offset = lg_rank%Ni; 16498f7e8f9dSMark Adams const PetscInt start = field * nloc, end = start + nloc, n_its = (nloc / Ni + !!(nloc % Ni)); // 1/Ni iters 16508f7e8f9dSMark Adams /* Invert diagonal for simpler triangular solves */ 16518f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=](int outer_index) { 16528f7e8f9dSMark Adams int i = start + outer_index * Ni + lg_rank % Ni; 16538f7e8f9dSMark Adams if (i < end) { 16548f7e8f9dSMark Adams PetscScalar *pv = ba_d + bdiag_d[i]; 16558f7e8f9dSMark Adams *pv = 1.0 / (*pv); 16568f7e8f9dSMark Adams } 16578f7e8f9dSMark Adams }); 16588f7e8f9dSMark Adams }); 16598f7e8f9dSMark Adams } 16609566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 16619566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isicol, &ic_h)); 16629566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &r_h)); 16638f7e8f9dSMark Adams 16649566063dSJacob Faibussowitsch PetscCall(ISIdentity(isrow, &row_identity)); 16659566063dSJacob Faibussowitsch PetscCall(ISIdentity(isicol, &col_identity)); 16668f7e8f9dSMark Adams if (b->inode.size) { 16678f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_Inode; 16688f7e8f9dSMark Adams } else if (row_identity && col_identity) { 16698f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 16708f7e8f9dSMark Adams } else { 16718f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 16728f7e8f9dSMark Adams } 16738f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_GPU; 16749566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU 16758f7e8f9dSMark Adams B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 16768f7e8f9dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 16778f7e8f9dSMark Adams B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 16788f7e8f9dSMark Adams B->ops->matsolve = MatMatSolve_SeqAIJ; 16798f7e8f9dSMark Adams B->assembled = PETSC_TRUE; 16808f7e8f9dSMark Adams B->preallocated = PETSC_TRUE; 16818f7e8f9dSMark Adams 16823ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1683930e68a5SMark Adams } 1684930e68a5SMark Adams 1685d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1686d71ae5a4SJacob Faibussowitsch { 1687930e68a5SMark Adams PetscFunctionBegin; 16889566063dSJacob Faibussowitsch PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 168986a27549SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 16903ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 169186a27549SJunchao Zhang } 169286a27549SJunchao Zhang 1693d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1694d71ae5a4SJacob Faibussowitsch { 169586a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 169686a27549SJunchao Zhang 169786a27549SJunchao Zhang PetscFunctionBegin; 169886a27549SJunchao Zhang if (!factors->sptrsv_symbolic_completed) { 169986a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 170086a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 170186a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_TRUE; 170286a27549SJunchao Zhang } 17033ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 170486a27549SJunchao Zhang } 170586a27549SJunchao Zhang 170686a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */ 1707d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1708d71ae5a4SJacob Faibussowitsch { 170986a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1710076ba34aSJunchao Zhang MatColIdxType n = A->rmap->n; 171186a27549SJunchao Zhang 171286a27549SJunchao Zhang PetscFunctionBegin; 171386a27549SJunchao Zhang if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 171486a27549SJunchao Zhang /* Update L^T and do sptrsv symbolic */ 1715076ba34aSJunchao Zhang factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); 171686a27549SJunchao Zhang Kokkos::deep_copy(factors->iLt_d, 0); /* KK requires 0 */ 1717076ba34aSJunchao Zhang factors->jLt_d = MatColIdxKokkosView("factors->jLt_d", factors->jL_d.extent(0)); 1718076ba34aSJunchao Zhang factors->aLt_d = MatScalarKokkosView("factors->aLt_d", factors->aL_d.extent(0)); 171986a27549SJunchao Zhang 17209371c9d4SSatish Balay transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 172186a27549SJunchao Zhang factors->iLt_d, factors->jLt_d, factors->aLt_d); 172286a27549SJunchao Zhang 172386a27549SJunchao Zhang /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 172486a27549SJunchao Zhang We have to sort the indices, until KK provides finer control options. 172586a27549SJunchao Zhang */ 17269371c9d4SSatish Balay sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 172786a27549SJunchao Zhang 172886a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 172986a27549SJunchao Zhang 173086a27549SJunchao Zhang /* Update U^T and do sptrsv symbolic */ 1731076ba34aSJunchao Zhang factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); 173286a27549SJunchao Zhang Kokkos::deep_copy(factors->iUt_d, 0); /* KK requires 0 */ 1733076ba34aSJunchao Zhang factors->jUt_d = MatColIdxKokkosView("factors->jUt_d", factors->jU_d.extent(0)); 1734076ba34aSJunchao Zhang factors->aUt_d = MatScalarKokkosView("factors->aUt_d", factors->aU_d.extent(0)); 173586a27549SJunchao Zhang 17369371c9d4SSatish Balay transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 173786a27549SJunchao Zhang factors->iUt_d, factors->jUt_d, factors->aUt_d); 173886a27549SJunchao Zhang 173986a27549SJunchao Zhang /* Sort indices. See comments above */ 17409371c9d4SSatish Balay sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 174186a27549SJunchao Zhang 174286a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 174386a27549SJunchao Zhang factors->transpose_updated = PETSC_TRUE; 174486a27549SJunchao Zhang } 17453ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 174686a27549SJunchao Zhang } 174786a27549SJunchao Zhang 174886a27549SJunchao Zhang /* Solve Ax = b, with A = LU */ 1749d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1750d71ae5a4SJacob Faibussowitsch { 175186a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 175286a27549SJunchao Zhang PetscScalarKokkosView xv; 175386a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 175486a27549SJunchao Zhang 175586a27549SJunchao Zhang PetscFunctionBegin; 17569566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 17579566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 17589566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(b, &bv)); 17599566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(x, &xv)); 176086a27549SJunchao Zhang /* Solve L tmpv = b */ 17619566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 176286a27549SJunchao Zhang /* Solve Ux = tmpv */ 17639566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 17649566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(b, &bv)); 17659566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 17669566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 17673ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 176886a27549SJunchao Zhang } 176986a27549SJunchao Zhang 1770076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */ 1771d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1772d71ae5a4SJacob Faibussowitsch { 177386a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 177486a27549SJunchao Zhang PetscScalarKokkosView xv; 177586a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 177686a27549SJunchao Zhang 177786a27549SJunchao Zhang PetscFunctionBegin; 17789566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 17799566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 17809566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(b, &bv)); 17819566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(x, &xv)); 178286a27549SJunchao Zhang /* Solve U^T tmpv = b */ 178386a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 178486a27549SJunchao Zhang 178586a27549SJunchao Zhang /* Solve L^T x = tmpv */ 178686a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 17879566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(b, &bv)); 17889566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 17899566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 17903ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 179186a27549SJunchao Zhang } 179286a27549SJunchao Zhang 1793d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1794d71ae5a4SJacob Faibussowitsch { 179586a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 179686a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 179786a27549SJunchao Zhang PetscInt fill_lev = info->levels; 179886a27549SJunchao Zhang 179986a27549SJunchao Zhang PetscFunctionBegin; 18009566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 18019566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1802076ba34aSJunchao Zhang 1803076ba34aSJunchao Zhang auto a_d = aijkok->a_dual.view_device(); 1804076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1805076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1806076ba34aSJunchao Zhang 1807076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 180886a27549SJunchao Zhang 180986a27549SJunchao Zhang B->assembled = PETSC_TRUE; 181086a27549SJunchao Zhang B->preallocated = PETSC_TRUE; 181186a27549SJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos; 181286a27549SJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 181386a27549SJunchao Zhang B->ops->matsolve = NULL; 181486a27549SJunchao Zhang B->ops->matsolvetranspose = NULL; 181586a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 181686a27549SJunchao Zhang 181786a27549SJunchao Zhang /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 181886a27549SJunchao Zhang factors->transpose_updated = PETSC_FALSE; 181986a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 1820eeadb341SJunchao Zhang /* TODO: log flops, but how to know that? */ 18219566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 18223ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 182386a27549SJunchao Zhang } 182486a27549SJunchao Zhang 1825d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1826d71ae5a4SJacob Faibussowitsch { 182786a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 182886a27549SJunchao Zhang Mat_SeqAIJ *b; 182986a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 183086a27549SJunchao Zhang PetscInt fill_lev = info->levels; 183186a27549SJunchao Zhang PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 183286a27549SJunchao Zhang PetscInt n = A->rmap->n; 183386a27549SJunchao Zhang 183486a27549SJunchao Zhang PetscFunctionBegin; 18359566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 183686a27549SJunchao Zhang /* Rebuild factors */ 18379371c9d4SSatish Balay if (factors) { 18389371c9d4SSatish Balay factors->Destroy(); 18399371c9d4SSatish Balay } /* Destroy the old if it exists */ 18409371c9d4SSatish Balay else { 18419371c9d4SSatish Balay B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 18429371c9d4SSatish Balay } 184386a27549SJunchao Zhang 184486a27549SJunchao Zhang /* Create a spiluk handle and then do symbolic factorization */ 184586a27549SJunchao Zhang nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 184686a27549SJunchao Zhang factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 184786a27549SJunchao Zhang 184886a27549SJunchao Zhang auto spiluk_handle = factors->kh.get_spiluk_handle(); 184986a27549SJunchao Zhang 185086a27549SJunchao Zhang Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 185186a27549SJunchao Zhang Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 185286a27549SJunchao Zhang Kokkos::realloc(factors->iU_d, n + 1); 185386a27549SJunchao Zhang Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 185486a27549SJunchao Zhang 185586a27549SJunchao Zhang aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1856076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1857076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1858076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 185986a27549SJunchao Zhang /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 186086a27549SJunchao Zhang 186186a27549SJunchao Zhang Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 186286a27549SJunchao Zhang Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 186386a27549SJunchao Zhang Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 186486a27549SJunchao Zhang Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 186586a27549SJunchao Zhang 186686a27549SJunchao Zhang /* TODO: add options to select sptrsv algorithms */ 186786a27549SJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 186886a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 186986a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 187086a27549SJunchao Zhang #else 187186a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 187286a27549SJunchao Zhang #endif 187386a27549SJunchao Zhang 187486a27549SJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 187586a27549SJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 187686a27549SJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 187786a27549SJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 187886a27549SJunchao Zhang 187986a27549SJunchao Zhang /* Fill fields of the factor matrix B */ 18809566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 188186a27549SJunchao Zhang b = (Mat_SeqAIJ *)B->data; 188286a27549SJunchao Zhang b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 188386a27549SJunchao Zhang B->info.fill_ratio_given = info->fill; 1884a1e4e190SStefano Zampini B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; 188586a27549SJunchao Zhang 188686a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 188786a27549SJunchao Zhang B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 18883ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1889930e68a5SMark Adams } 1890930e68a5SMark Adams 1891d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1892d71ae5a4SJacob Faibussowitsch { 18938f7e8f9dSMark Adams Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 18948f7e8f9dSMark Adams const PetscInt nrows = A->rmap->n; 1895930e68a5SMark Adams 18968f7e8f9dSMark Adams PetscFunctionBegin; 18979566063dSJacob Faibussowitsch PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 18988f7e8f9dSMark Adams B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 18998f7e8f9dSMark Adams // move B data into Kokkos 19009566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok 19019566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok 19028f7e8f9dSMark Adams { 19038f7e8f9dSMark Adams Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 1904300d22a6SJunchao Zhang if (!baijkok->diag_d.extent(0)) { 19058f7e8f9dSMark Adams const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_diag(b->diag, nrows + 1); 1906300d22a6SJunchao Zhang baijkok->diag_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_diag)); 1907300d22a6SJunchao Zhang Kokkos::deep_copy(baijkok->diag_d, h_diag); 19088f7e8f9dSMark Adams } 19098f7e8f9dSMark Adams } 19103ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 19118f7e8f9dSMark Adams } 19128f7e8f9dSMark Adams 1913d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1914d71ae5a4SJacob Faibussowitsch { 1915930e68a5SMark Adams PetscFunctionBegin; 1916930e68a5SMark Adams *type = MATSOLVERKOKKOS; 19173ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1918930e68a5SMark Adams } 1919930e68a5SMark Adams 1920d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A, MatSolverType *type) 1921d71ae5a4SJacob Faibussowitsch { 19228f7e8f9dSMark Adams PetscFunctionBegin; 19238f7e8f9dSMark Adams *type = MATSOLVERKOKKOSDEVICE; 19243ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 19258f7e8f9dSMark Adams } 19268f7e8f9dSMark Adams 1927930e68a5SMark Adams /*MC 192886a27549SJunchao Zhang MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 192911a5261eSBarry Smith on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1930930e68a5SMark Adams 1931930e68a5SMark Adams Level: beginner 1932930e68a5SMark Adams 1933*1cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1934930e68a5SMark Adams M*/ 193586a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1936930e68a5SMark Adams { 1937930e68a5SMark Adams PetscInt n = A->rmap->n; 1938930e68a5SMark Adams 1939930e68a5SMark Adams PetscFunctionBegin; 19409566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 19419566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*B, n, n, n, n)); 1942930e68a5SMark Adams (*B)->factortype = ftype; 19439566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 19449566063dSJacob Faibussowitsch PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1945930e68a5SMark Adams 19468f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 19479566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 194886a27549SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 194986a27549SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 195086a27549SJunchao Zhang } else if (ftype == MAT_FACTOR_ILU) { 19519566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 195286a27549SJunchao Zhang (*B)->canuseordering = PETSC_FALSE; 195386a27549SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 195498921bdaSJacob Faibussowitsch } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1955930e68a5SMark Adams 19569566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 19579566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 19583ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1959930e68a5SMark Adams } 19608f7e8f9dSMark Adams 1961d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A, MatFactorType ftype, Mat *B) 1962d71ae5a4SJacob Faibussowitsch { 19638f7e8f9dSMark Adams PetscInt n = A->rmap->n; 19648f7e8f9dSMark Adams 19658f7e8f9dSMark Adams PetscFunctionBegin; 19669566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 19679566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*B, n, n, n, n)); 19688f7e8f9dSMark Adams (*B)->factortype = ftype; 1969f73b0415SBarry Smith (*B)->canuseordering = PETSC_TRUE; 19709566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 19719566063dSJacob Faibussowitsch PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 19728f7e8f9dSMark Adams 19738f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 19749566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 19758f7e8f9dSMark Adams (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 19768f7e8f9dSMark Adams } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for KOKKOS Matrix Types"); 19778f7e8f9dSMark Adams 19789566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 19799566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_kokkos_device)); 19803ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 19818f7e8f9dSMark Adams } 198286a27549SJunchao Zhang 1983d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1984d71ae5a4SJacob Faibussowitsch { 198586a27549SJunchao Zhang PetscFunctionBegin; 19869566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 19879566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 19889566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_seqaijkokkos_kokkos_device)); 19893ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 199086a27549SJunchao Zhang } 199186a27549SJunchao Zhang 1992076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */ 1993d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1994d71ae5a4SJacob Faibussowitsch { 1995076ba34aSJunchao Zhang const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map); 1996076ba34aSJunchao Zhang const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries); 1997076ba34aSJunchao Zhang const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values); 1998076ba34aSJunchao Zhang const PetscInt *i = iv.data(); 1999076ba34aSJunchao Zhang const PetscInt *j = jv.data(); 2000076ba34aSJunchao Zhang const PetscScalar *a = av.data(); 2001076ba34aSJunchao Zhang PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 2002076ba34aSJunchao Zhang 2003076ba34aSJunchao Zhang PetscFunctionBegin; 20049566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 2005076ba34aSJunchao Zhang for (PetscInt k = 0; k < m; k++) { 20069566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 200748a46eb9SPierre Jolivet for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 20089566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 2009076ba34aSJunchao Zhang } 20103ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2011076ba34aSJunchao Zhang } 2012