111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 2076ba34aSJunchao Zhang #include <petscpkg_version.h> 3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 442550becSJunchao Zhang #include <petsc/private/sfimpl.h> 58c3ff71bSJunchao Zhang #include <petscsystypes.h> 68c3ff71bSJunchao Zhang #include <petscerror.h> 78c3ff71bSJunchao Zhang 88c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 9f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 108c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 118c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 1286a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp> 1386a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp> 14076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp> 15076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp> 1686a27549SJunchao Zhang 1742550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 188c3ff71bSJunchao Zhang 19f98996d3SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 6, 99) 20f98996d3SJunchao Zhang #include <KokkosSparse_Utils.hpp> 21f98996d3SJunchao Zhang using KokkosSparse::sort_crs_matrix; 229371c9d4SSatish Balay using KokkosSparse::Impl::transpose_matrix; 23f98996d3SJunchao Zhang #else 24f98996d3SJunchao Zhang #include <KokkosKernels_Sorting.hpp> 25f98996d3SJunchao Zhang using KokkosKernels::sort_crs_matrix; 269371c9d4SSatish Balay using KokkosKernels::Impl::transpose_matrix; 27f98996d3SJunchao Zhang #endif 28f98996d3SJunchao Zhang 298c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 308c3ff71bSJunchao Zhang 31076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 32076ba34aSJunchao Zhang we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 33076ba34aSJunchao Zhang In the latter case, it is important to set a_dual's sync state correctly. 34076ba34aSJunchao Zhang */ 35d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode) 36d71ae5a4SJacob Faibussowitsch { 37076ba34aSJunchao Zhang Mat_SeqAIJ *aijseq; 38076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 398c3ff71bSJunchao Zhang 408c3ff71bSJunchao Zhang PetscFunctionBegin; 413ba16761SJacob Faibussowitsch if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS); 429566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd_SeqAIJ(A, mode)); 43076ba34aSJunchao Zhang 44076ba34aSJunchao Zhang aijseq = static_cast<Mat_SeqAIJ *>(A->data); 45076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 46076ba34aSJunchao Zhang 47076ba34aSJunchao Zhang /* If aijkok does not exist, we just copy i, j to device. 48076ba34aSJunchao Zhang If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 49076ba34aSJunchao Zhang In both cases, we build a new aijkok structure. 50076ba34aSJunchao Zhang */ 51076ba34aSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 52076ba34aSJunchao Zhang delete aijkok; 53076ba34aSJunchao Zhang aijkok = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aijseq->nz, aijseq->i, aijseq->j, aijseq->a, A->nonzerostate, PETSC_FALSE /*don't copy mat values to device*/); 54076ba34aSJunchao Zhang A->spptr = aijkok; 55076ba34aSJunchao Zhang } 56076ba34aSJunchao Zhang 575519a089SJose E. Roman if (aijkok->device_mat_d.data()) { 58a587d139SMark A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 59a587d139SMark } 603ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 618c3ff71bSJunchao Zhang } 628c3ff71bSJunchao Zhang 6386a27549SJunchao Zhang /* Sync CSR data to device if not yet */ 64d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 65d71ae5a4SJacob Faibussowitsch { 668c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 678c3ff71bSJunchao Zhang 688c3ff71bSJunchao Zhang PetscFunctionBegin; 695f80ce2aSJacob Faibussowitsch PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from host to device"); 705f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 71076ba34aSJunchao Zhang if (aijkok->a_dual.need_sync_device()) { 72076ba34aSJunchao Zhang aijkok->a_dual.sync_device(); 73580c7c76SPierre Jolivet aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */ 7486a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 758c3ff71bSJunchao Zhang } 763ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 778c3ff71bSJunchao Zhang } 788c3ff71bSJunchao Zhang 79076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */ 80d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 81d71ae5a4SJacob Faibussowitsch { 8286a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 8386a27549SJunchao Zhang 8486a27549SJunchao Zhang PetscFunctionBegin; 855f80ce2aSJacob Faibussowitsch PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries"); 8686a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 8786a27549SJunchao Zhang aijkok->a_dual.modify_device(); 8886a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 8986a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 909566063dSJacob Faibussowitsch PetscCall(MatSeqAIJInvalidateDiagonal(A)); 919566063dSJacob Faibussowitsch PetscCall(PetscObjectStateIncrease((PetscObject)A)); 923ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 9386a27549SJunchao Zhang } 9486a27549SJunchao Zhang 95d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 96d71ae5a4SJacob Faibussowitsch { 97f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 98f0cf5187SStefano Zampini 99f0cf5187SStefano Zampini PetscFunctionBegin; 100f0cf5187SStefano Zampini PetscCheckTypeName(A, MATSEQAIJKOKKOS); 10186a27549SJunchao Zhang /* We do not expect one needs factors on host */ 1025f80ce2aSJacob Faibussowitsch PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Cann't sync factorized matrix from device to host"); 1035f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK"); 104076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 1053ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 106f0cf5187SStefano Zampini } 107f0cf5187SStefano Zampini 108d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 109d71ae5a4SJacob Faibussowitsch { 110076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 111f0cf5187SStefano Zampini 112f0cf5187SStefano Zampini PetscFunctionBegin; 1135519a089SJose E. Roman /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's. 1145519a089SJose E. Roman Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be 1155519a089SJose E. Roman reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate 1165519a089SJose E. Roman must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd. 1175519a089SJose E. Roman */ 1185519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 119076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 120076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 121076ba34aSJunchao Zhang } else { /* Happens when calling MatSetValues on a newly created matrix */ 122076ba34aSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 123076ba34aSJunchao Zhang } 1243ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 125076ba34aSJunchao Zhang } 126076ba34aSJunchao Zhang 127d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[]) 128d71ae5a4SJacob Faibussowitsch { 129076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 130076ba34aSJunchao Zhang 131076ba34aSJunchao Zhang PetscFunctionBegin; 1325519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host(); 1333ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 134076ba34aSJunchao Zhang } 135076ba34aSJunchao Zhang 136d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 137d71ae5a4SJacob Faibussowitsch { 138076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 139076ba34aSJunchao Zhang 140076ba34aSJunchao Zhang PetscFunctionBegin; 1415519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 142076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 143076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1442328674fSJunchao Zhang } else { 1452328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 1462328674fSJunchao Zhang } 1473ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 148076ba34aSJunchao Zhang } 149076ba34aSJunchao Zhang 150d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[]) 151d71ae5a4SJacob Faibussowitsch { 152076ba34aSJunchao Zhang PetscFunctionBegin; 153076ba34aSJunchao Zhang *array = NULL; 1543ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 155076ba34aSJunchao Zhang } 156076ba34aSJunchao Zhang 157d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 158d71ae5a4SJacob Faibussowitsch { 159076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 160076ba34aSJunchao Zhang 161076ba34aSJunchao Zhang PetscFunctionBegin; 1625519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 163076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1642328674fSJunchao Zhang } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 1652328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ *>(A->data)->a; 1662328674fSJunchao Zhang } 1673ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 168076ba34aSJunchao Zhang } 169076ba34aSJunchao Zhang 170d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[]) 171d71ae5a4SJacob Faibussowitsch { 172076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 173076ba34aSJunchao Zhang 174076ba34aSJunchao Zhang PetscFunctionBegin; 1755519a089SJose E. Roman if (aijkok && A->nonzerostate == aijkok->nonzerostate) { 176076ba34aSJunchao Zhang aijkok->a_dual.clear_sync_state(); 177076ba34aSJunchao Zhang aijkok->a_dual.modify_host(); 1782328674fSJunchao Zhang } 1793ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 180f0cf5187SStefano Zampini } 181f0cf5187SStefano Zampini 182d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype) 183d71ae5a4SJacob Faibussowitsch { 1847ee59b9bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1857ee59b9bSJunchao Zhang 1867ee59b9bSJunchao Zhang PetscFunctionBegin; 1877ee59b9bSJunchao Zhang PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL"); 1887ee59b9bSJunchao Zhang 1897ee59b9bSJunchao Zhang if (i) *i = aijkok->i_device_data(); 1907ee59b9bSJunchao Zhang if (j) *j = aijkok->j_device_data(); 1917ee59b9bSJunchao Zhang if (a) { 1927ee59b9bSJunchao Zhang aijkok->a_dual.sync_device(); 1937ee59b9bSJunchao Zhang *a = aijkok->a_device_data(); 1947ee59b9bSJunchao Zhang } 1957ee59b9bSJunchao Zhang if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; 1963ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1977ee59b9bSJunchao Zhang } 1987ee59b9bSJunchao Zhang 199a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 200d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 201d71ae5a4SJacob Faibussowitsch { 202042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 203042217e8SBarry Smith Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 204a587d139SMark 205a587d139SMark PetscFunctionBegin; 2065f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 207152b3e56SJunchao Zhang aijkok->device_mat_d = create_mirror(DefaultMemorySpace(), h_mat_k); 208a587d139SMark Kokkos::deep_copy(aijkok->device_mat_d, h_mat_k); 2093ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 210a587d139SMark } 211a587d139SMark 212a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 213d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 214d71ae5a4SJacob Faibussowitsch { 215042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 216a587d139SMark 217a587d139SMark PetscFunctionBegin; 218a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 219a587d139SMark *d_mat = aijkok->device_mat_d.data(); 220a587d139SMark } else { 2219566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok (we are making d_mat now so make a place for it) 222a587d139SMark *d_mat = NULL; 223a587d139SMark } 2243ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 225a587d139SMark } 226076ba34aSJunchao Zhang 227076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */ 228d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 229d71ae5a4SJacob Faibussowitsch { 230152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 231152b3e56SJunchao Zhang 232152b3e56SJunchao Zhang PetscFunctionBegin; 2335f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 234076ba34aSJunchao Zhang if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 235076ba34aSJunchao Zhang /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 2369566063dSJacob Faibussowitsch PetscCallCXX(aijkok->a_dual.sync_device()); 237f98996d3SJunchao Zhang PetscCallCXX(aijkok->csrmatT = transpose_matrix(aijkok->csrmat)); 238f98996d3SJunchao Zhang PetscCallCXX(sort_crs_matrix(aijkok->csrmatT)); 23986a27549SJunchao Zhang aijkok->transpose_updated = PETSC_TRUE; 240076ba34aSJunchao Zhang } 241076ba34aSJunchao Zhang *csrmatT = &aijkok->csrmatT; 2423ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 243152b3e56SJunchao Zhang } 244152b3e56SJunchao Zhang 245076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */ 246d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 247d71ae5a4SJacob Faibussowitsch { 248152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 249152b3e56SJunchao Zhang 250152b3e56SJunchao Zhang PetscFunctionBegin; 2519566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2525f80ce2aSJacob Faibussowitsch PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 253076ba34aSJunchao Zhang if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 2549566063dSJacob Faibussowitsch PetscCallCXX(aijkok->a_dual.sync_device()); 255f98996d3SJunchao Zhang PetscCallCXX(aijkok->csrmatH = transpose_matrix(aijkok->csrmat)); 256f98996d3SJunchao Zhang PetscCallCXX(sort_crs_matrix(aijkok->csrmatH)); 257076ba34aSJunchao Zhang #if defined(PETSC_USE_COMPLEX) 258076ba34aSJunchao Zhang const auto &a = aijkok->csrmatH.values; 2599371c9d4SSatish Balay Kokkos::parallel_for( 2609371c9d4SSatish Balay a.extent(0), KOKKOS_LAMBDA(MatRowMapType i) { a(i) = PetscConj(a(i)); }); 261076ba34aSJunchao Zhang #endif 26286a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_TRUE; 263076ba34aSJunchao Zhang } 264076ba34aSJunchao Zhang *csrmatH = &aijkok->csrmatH; 2659566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 2663ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 267152b3e56SJunchao Zhang } 268a587d139SMark 2698c3ff71bSJunchao Zhang /* y = A x */ 270d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 271d71ae5a4SJacob Faibussowitsch { 2728c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 273152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 274152b3e56SJunchao Zhang PetscScalarKokkosView yv; 2758c3ff71bSJunchao Zhang 2768c3ff71bSJunchao Zhang PetscFunctionBegin; 2779566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 2789566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 2799566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 2809566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 2818c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 2829d52486cSJunchao Zhang PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */ 2839566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 2849566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 285076ba34aSJunchao Zhang /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 2869566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 2879566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 2883ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 2898c3ff71bSJunchao Zhang } 2908c3ff71bSJunchao Zhang 2918c3ff71bSJunchao Zhang /* y = A^T x */ 292d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 293d71ae5a4SJacob Faibussowitsch { 2948c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 295152b3e56SJunchao Zhang const char *mode; 296152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 297152b3e56SJunchao Zhang PetscScalarKokkosView yv; 298076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 2998c3ff71bSJunchao Zhang 3008c3ff71bSJunchao Zhang PetscFunctionBegin; 3019566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3029566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3039566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 3049566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 305152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 3069566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 307152b3e56SJunchao Zhang mode = "N"; 308152b3e56SJunchao Zhang } else { 309076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 310076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 311152b3e56SJunchao Zhang mode = "T"; 312152b3e56SJunchao Zhang } 3139d52486cSJunchao Zhang PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */ 3149566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 3159566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 3169566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz())); 3179566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3183ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3198c3ff71bSJunchao Zhang } 3208c3ff71bSJunchao Zhang 3218c3ff71bSJunchao Zhang /* y = A^H x */ 322d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy) 323d71ae5a4SJacob Faibussowitsch { 3248c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 325152b3e56SJunchao Zhang const char *mode; 326152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 327152b3e56SJunchao Zhang PetscScalarKokkosView yv; 328076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3298c3ff71bSJunchao Zhang 3308c3ff71bSJunchao Zhang PetscFunctionBegin; 3319566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3329566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3339566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 3349566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(yy, &yv)); 335152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 3369566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 337152b3e56SJunchao Zhang mode = "N"; 338152b3e56SJunchao Zhang } else { 339076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 340076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 341152b3e56SJunchao Zhang mode = "C"; 342152b3e56SJunchao Zhang } 3439d52486cSJunchao Zhang PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */ 3449566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 3459566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(yy, &yv)); 3469566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz())); 3479566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3483ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3498c3ff71bSJunchao Zhang } 3508c3ff71bSJunchao Zhang 3518c3ff71bSJunchao Zhang /* z = A x + y */ 352d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 353d71ae5a4SJacob Faibussowitsch { 3548c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 355152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv, yv; 356152b3e56SJunchao Zhang PetscScalarKokkosView zv; 3578c3ff71bSJunchao Zhang 3588c3ff71bSJunchao Zhang PetscFunctionBegin; 3599566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3609566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3619566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 3629566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(yy, &yv)); 3639566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(zz, &zv)); 3648c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv, yv); 3658c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 3669d52486cSJunchao Zhang PetscCallCXX(KokkosSparse::spmv("N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */ 3679566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 3689566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(yy, &yv)); 3699566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 3709566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz())); 3719566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 3723ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3738c3ff71bSJunchao Zhang } 3748c3ff71bSJunchao Zhang 3758c3ff71bSJunchao Zhang /* z = A^T x + y */ 376d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 377d71ae5a4SJacob Faibussowitsch { 3788c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 379152b3e56SJunchao Zhang const char *mode; 380152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv, yv; 381152b3e56SJunchao Zhang PetscScalarKokkosView zv; 382076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3838c3ff71bSJunchao Zhang 3848c3ff71bSJunchao Zhang PetscFunctionBegin; 3859566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 3869566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 3879566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 3889566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(yy, &yv)); 3899566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(zz, &zv)); 3908c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv, yv); 391152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 3929566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat)); 393152b3e56SJunchao Zhang mode = "N"; 394152b3e56SJunchao Zhang } else { 395076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 396076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 397152b3e56SJunchao Zhang mode = "T"; 398152b3e56SJunchao Zhang } 3999d52486cSJunchao Zhang PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */ 4009566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 4019566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(yy, &yv)); 4029566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 4039566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz())); 4049566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4053ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4068c3ff71bSJunchao Zhang } 4078c3ff71bSJunchao Zhang 4088c3ff71bSJunchao Zhang /* z = A^H x + y */ 409d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz) 410d71ae5a4SJacob Faibussowitsch { 4118c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 412152b3e56SJunchao Zhang const char *mode; 413152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv, yv; 414152b3e56SJunchao Zhang PetscScalarKokkosView zv; 415076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 4168c3ff71bSJunchao Zhang 4178c3ff71bSJunchao Zhang PetscFunctionBegin; 4189566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 4199566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 4209566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(xx, &xv)); 4219566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(yy, &yv)); 4229566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(zz, &zv)); 4238c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv, yv); 424152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 4259566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat)); 426152b3e56SJunchao Zhang mode = "N"; 427152b3e56SJunchao Zhang } else { 428076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 429076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 430152b3e56SJunchao Zhang mode = "C"; 431152b3e56SJunchao Zhang } 4329d52486cSJunchao Zhang PetscCallCXX(KokkosSparse::spmv(mode, 1.0 /*alpha*/, *csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */ 4339566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(xx, &xv)); 4349566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(yy, &yv)); 4359566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(zz, &zv)); 4369566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(2.0 * csrmat->nnz())); 4379566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 4383ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 439152b3e56SJunchao Zhang } 440152b3e56SJunchao Zhang 441d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg) 442d71ae5a4SJacob Faibussowitsch { 443152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 444152b3e56SJunchao Zhang 445152b3e56SJunchao Zhang PetscFunctionBegin; 446152b3e56SJunchao Zhang switch (op) { 447152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 448152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 4499566063dSJacob Faibussowitsch if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose()); 450152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 451152b3e56SJunchao Zhang break; 452d71ae5a4SJacob Faibussowitsch default: 453d71ae5a4SJacob Faibussowitsch PetscCall(MatSetOption_SeqAIJ(A, op, flg)); 454d71ae5a4SJacob Faibussowitsch break; 455152b3e56SJunchao Zhang } 4563ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4578c3ff71bSJunchao Zhang } 4588c3ff71bSJunchao Zhang 459076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */ 460d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat) 461d71ae5a4SJacob Faibussowitsch { 462076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 4638c3ff71bSJunchao Zhang 4648c3ff71bSJunchao Zhang PetscFunctionBegin; 4659566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 466076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 4679566063dSJacob Faibussowitsch PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat)); /* the returned newmat is a SeqAIJKokkos */ 4688c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 4699566063dSJacob Faibussowitsch PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */ 470076ba34aSJunchao Zhang } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 4715f80ce2aSJacob Faibussowitsch PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX"); 4729566063dSJacob Faibussowitsch PetscCall(PetscFree(A->defaultvectype)); 4739566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */ 4749566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS)); 4759566063dSJacob Faibussowitsch PetscCall(MatSetOps_SeqAIJKokkos(A)); 476076ba34aSJunchao Zhang aseq = static_cast<Mat_SeqAIJ *>(A->data); 477394ed5ebSJunchao Zhang if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */ 4785f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 479076ba34aSJunchao Zhang A->spptr = new Mat_SeqAIJKokkos(A->rmap->n, A->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, A->nonzerostate, PETSC_FALSE); 4808c3ff71bSJunchao Zhang } 481076ba34aSJunchao Zhang } 4823ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4838c3ff71bSJunchao Zhang } 4848c3ff71bSJunchao Zhang 485076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 486076ba34aSJunchao Zhang an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 487076ba34aSJunchao Zhang */ 488d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B) 489d71ae5a4SJacob Faibussowitsch { 490076ba34aSJunchao Zhang Mat_SeqAIJ *bseq; 491076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok; 492076ba34aSJunchao Zhang Mat mat; 4938c3ff71bSJunchao Zhang 4948c3ff71bSJunchao Zhang PetscFunctionBegin; 495076ba34aSJunchao Zhang /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 4969566063dSJacob Faibussowitsch PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B)); 497076ba34aSJunchao Zhang mat = *B; 498076ba34aSJunchao Zhang if (A->assembled) { 499076ba34aSJunchao Zhang bseq = static_cast<Mat_SeqAIJ *>(mat->data); 500076ba34aSJunchao Zhang bkok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, bseq->nz, bseq->i, bseq->j, bseq->a, mat->nonzerostate, PETSC_FALSE); 501076ba34aSJunchao Zhang bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 502076ba34aSJunchao Zhang /* Now copy values to B if needed */ 503076ba34aSJunchao Zhang if (dupOption == MAT_COPY_VALUES) { 504076ba34aSJunchao Zhang if (akok->a_dual.need_sync_device()) { 505076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host()); 506076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 507076ba34aSJunchao Zhang } else { /* If device has the latest data, we only copy data on device */ 508076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device()); 509076ba34aSJunchao Zhang bkok->a_dual.modify_device(); 510076ba34aSJunchao Zhang } 511076ba34aSJunchao Zhang } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 512076ba34aSJunchao Zhang /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 513076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 514076ba34aSJunchao Zhang } 515076ba34aSJunchao Zhang mat->spptr = bkok; 516076ba34aSJunchao Zhang } 517076ba34aSJunchao Zhang 5189566063dSJacob Faibussowitsch PetscCall(PetscFree(mat->defaultvectype)); 5199566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */ 5209566063dSJacob Faibussowitsch PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS)); 5219566063dSJacob Faibussowitsch PetscCall(MatSetOps_SeqAIJKokkos(mat)); 5223ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5238c3ff71bSJunchao Zhang } 5248c3ff71bSJunchao Zhang 525d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B) 526d71ae5a4SJacob Faibussowitsch { 5270ecb592aSJunchao Zhang Mat At; 528ff751488SJunchao Zhang KokkosCsrMatrix *internT; 5290ecb592aSJunchao Zhang Mat_SeqAIJKokkos *atkok, *bkok; 5300ecb592aSJunchao Zhang 5310ecb592aSJunchao Zhang PetscFunctionBegin; 5327fb60732SBarry Smith if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B)); 5339566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */ 5340ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 535ff751488SJunchao Zhang /* Deep copy internT, as we want to isolate the internal transpose */ 5369566063dSJacob Faibussowitsch PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", *internT))); 5379566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At)); 5380ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) *B = At; 5399566063dSJacob Faibussowitsch else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */ 5400ecb592aSJunchao Zhang } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 5410ecb592aSJunchao Zhang if ((*B)->assembled) { 5420ecb592aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr); 5439566063dSJacob Faibussowitsch PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT->values)); 5449566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(*B)); 5450ecb592aSJunchao Zhang } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 5460ecb592aSJunchao Zhang Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data); 5470ecb592aSJunchao Zhang MatScalarKokkosViewHost a_h(bseq->a, internT->nnz()); /* bseq->nz = 0 if unassembled */ 5480ecb592aSJunchao Zhang MatColIdxKokkosViewHost j_h(bseq->j, internT->nnz()); 5499566063dSJacob Faibussowitsch PetscCallCXX(Kokkos::deep_copy(a_h, internT->values)); 5509566063dSJacob Faibussowitsch PetscCallCXX(Kokkos::deep_copy(j_h, internT->graph.entries)); 5510ecb592aSJunchao Zhang } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated"); 5520ecb592aSJunchao Zhang } 5533ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5540ecb592aSJunchao Zhang } 5550ecb592aSJunchao Zhang 556d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 557d71ae5a4SJacob Faibussowitsch { 55886a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 5598c3ff71bSJunchao Zhang 5608c3ff71bSJunchao Zhang PetscFunctionBegin; 56186a27549SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 56286a27549SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 5638c3ff71bSJunchao Zhang delete aijkok; 56486a27549SJunchao Zhang } else { 56586a27549SJunchao Zhang delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr); 56686a27549SJunchao Zhang } 567cbc6b225SStefano Zampini A->spptr = NULL; 5689566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL)); 5699566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL)); 5709566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL)); 5719566063dSJacob Faibussowitsch PetscCall(MatDestroy_SeqAIJ(A)); 5723ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 5738c3ff71bSJunchao Zhang } 5748c3ff71bSJunchao Zhang 5753f3ba80aSJunchao Zhang /*MC 5763f3ba80aSJunchao Zhang MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos 5773f3ba80aSJunchao Zhang 5783f3ba80aSJunchao Zhang A matrix type type using Kokkos-Kernels CrsMatrix type for portability across different device types 5793f3ba80aSJunchao Zhang 580*2ef1f0ffSBarry Smith Options Database Key: 58111a5261eSBarry Smith . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()` 5823f3ba80aSJunchao Zhang 5833f3ba80aSJunchao Zhang Level: beginner 5843f3ba80aSJunchao Zhang 585*2ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS` 5863f3ba80aSJunchao Zhang M*/ 587d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 588d71ae5a4SJacob Faibussowitsch { 58986a27549SJunchao Zhang PetscFunctionBegin; 5909566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 5919566063dSJacob Faibussowitsch PetscCall(MatCreate_SeqAIJ(A)); 5929566063dSJacob Faibussowitsch PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A)); 5933ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 59486a27549SJunchao Zhang } 59586a27549SJunchao Zhang 596076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 597d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C) 598d71ae5a4SJacob Faibussowitsch { 599076ba34aSJunchao Zhang Mat_SeqAIJ *a, *b; 600076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok, *bkok, *ckok; 601076ba34aSJunchao Zhang MatScalarKokkosView aa, ba, ca; 602076ba34aSJunchao Zhang MatRowMapKokkosView ai, bi, ci; 603076ba34aSJunchao Zhang MatColIdxKokkosView aj, bj, cj; 604076ba34aSJunchao Zhang PetscInt m, n, nnz, aN; 605a3f881fbSStefano Zampini 606a3f881fbSStefano Zampini PetscFunctionBegin; 607076ba34aSJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 608076ba34aSJunchao Zhang PetscValidHeaderSpecific(B, MAT_CLASSID, 2); 609076ba34aSJunchao Zhang PetscValidPointer(C, 4); 610076ba34aSJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 611076ba34aSJunchao Zhang PetscCheckTypeName(B, MATSEQAIJKOKKOS); 6125f80ce2aSJacob Faibussowitsch PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n); 6135f80ce2aSJacob Faibussowitsch PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported"); 614076ba34aSJunchao Zhang 6159566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 6169566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 617076ba34aSJunchao Zhang a = static_cast<Mat_SeqAIJ *>(A->data); 618076ba34aSJunchao Zhang b = static_cast<Mat_SeqAIJ *>(B->data); 619076ba34aSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 620076ba34aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 621076ba34aSJunchao Zhang aa = akok->a_dual.view_device(); 622076ba34aSJunchao Zhang ai = akok->i_dual.view_device(); 623076ba34aSJunchao Zhang ba = bkok->a_dual.view_device(); 624076ba34aSJunchao Zhang bi = bkok->i_dual.view_device(); 625076ba34aSJunchao Zhang m = A->rmap->n; /* M, N and nnz of C */ 626076ba34aSJunchao Zhang n = A->cmap->n + B->cmap->n; 627076ba34aSJunchao Zhang nnz = a->nz + b->nz; 628076ba34aSJunchao Zhang aN = A->cmap->n; /* N of A */ 629076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { 630076ba34aSJunchao Zhang aj = akok->j_dual.view_device(); 631076ba34aSJunchao Zhang bj = bkok->j_dual.view_device(); 632076ba34aSJunchao Zhang auto ca_dual = MatScalarKokkosDualView("a", aa.extent(0) + ba.extent(0)); 633076ba34aSJunchao Zhang auto ci_dual = MatRowMapKokkosDualView("i", ai.extent(0)); 634076ba34aSJunchao Zhang auto cj_dual = MatColIdxKokkosDualView("j", aj.extent(0) + bj.extent(0)); 635076ba34aSJunchao Zhang ca = ca_dual.view_device(); 636076ba34aSJunchao Zhang ci = ci_dual.view_device(); 637076ba34aSJunchao Zhang cj = cj_dual.view_device(); 638076ba34aSJunchao Zhang 639076ba34aSJunchao Zhang /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 6409371c9d4SSatish Balay Kokkos::parallel_for( 6419371c9d4SSatish Balay Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 642076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 643076ba34aSJunchao Zhang PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 644076ba34aSJunchao Zhang 645076ba34aSJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 646076ba34aSJunchao Zhang ci(i) = coffset; 647076ba34aSJunchao Zhang if (i == m - 1) ci(m) = ai(m) + bi(m); 648076ba34aSJunchao Zhang }); 649076ba34aSJunchao Zhang 650076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 651076ba34aSJunchao Zhang if (k < alen) { 652076ba34aSJunchao Zhang ca(coffset + k) = aa(ai(i) + k); 653076ba34aSJunchao Zhang cj(coffset + k) = aj(ai(i) + k); 654076ba34aSJunchao Zhang } else { 655076ba34aSJunchao Zhang ca(coffset + k) = ba(bi(i) + k - alen); 656076ba34aSJunchao Zhang cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */ 657076ba34aSJunchao Zhang } 658076ba34aSJunchao Zhang }); 659076ba34aSJunchao Zhang }); 660076ba34aSJunchao Zhang ca_dual.modify_device(); 661076ba34aSJunchao Zhang ci_dual.modify_device(); 662076ba34aSJunchao Zhang cj_dual.modify_device(); 6639566063dSJacob Faibussowitsch PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci_dual, cj_dual, ca_dual)); 6649566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C)); 665076ba34aSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { 666076ba34aSJunchao Zhang PetscValidHeaderSpecific(*C, MAT_CLASSID, 4); 667076ba34aSJunchao Zhang PetscCheckTypeName(*C, MATSEQAIJKOKKOS); 668076ba34aSJunchao Zhang ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr); 669076ba34aSJunchao Zhang ca = ckok->a_dual.view_device(); 670076ba34aSJunchao Zhang ci = ckok->i_dual.view_device(); 671076ba34aSJunchao Zhang 6729371c9d4SSatish Balay Kokkos::parallel_for( 6739371c9d4SSatish Balay Kokkos::TeamPolicy<>(m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 674076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 675076ba34aSJunchao Zhang PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i); 676076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) { 677076ba34aSJunchao Zhang if (k < alen) ca(ci(i) + k) = aa(ai(i) + k); 678076ba34aSJunchao Zhang else ca(ci(i) + k) = ba(bi(i) + k - alen); 679076ba34aSJunchao Zhang }); 680076ba34aSJunchao Zhang }); 6819566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(*C)); 682076ba34aSJunchao Zhang } 6833ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 684076ba34aSJunchao Zhang } 685076ba34aSJunchao Zhang 686d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void *pdata) 687d71ae5a4SJacob Faibussowitsch { 688076ba34aSJunchao Zhang PetscFunctionBegin; 689076ba34aSJunchao Zhang delete static_cast<MatProductData_SeqAIJKokkos *>(pdata); 6903ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 691a3f881fbSStefano Zampini } 692a3f881fbSStefano Zampini 693d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 694d71ae5a4SJacob Faibussowitsch { 695a3f881fbSStefano Zampini Mat_Product *product = C->product; 696a3f881fbSStefano Zampini Mat A, B; 697076ba34aSJunchao Zhang bool transA, transB; /* use bool, since KK needs this type */ 698a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok, *bkok, *ckok; 699a3f881fbSStefano Zampini Mat_SeqAIJ *c; 700076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 701076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA, *csrmatB; 702a3f881fbSStefano Zampini 703a3f881fbSStefano Zampini PetscFunctionBegin; 704a3f881fbSStefano Zampini MatCheckProduct(C, 1); 7055f80ce2aSJacob Faibussowitsch PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty"); 706076ba34aSJunchao Zhang pdata = static_cast<MatProductData_SeqAIJKokkos *>(C->product->data); 707076ba34aSJunchao Zhang 708076ba34aSJunchao Zhang if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 709076ba34aSJunchao Zhang pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 7103ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 711076ba34aSJunchao Zhang } 712076ba34aSJunchao Zhang 713076ba34aSJunchao Zhang switch (product->type) { 7149371c9d4SSatish Balay case MATPRODUCT_AB: 7159371c9d4SSatish Balay transA = false; 7169371c9d4SSatish Balay transB = false; 7179371c9d4SSatish Balay break; 7189371c9d4SSatish Balay case MATPRODUCT_AtB: 7199371c9d4SSatish Balay transA = true; 7209371c9d4SSatish Balay transB = false; 7219371c9d4SSatish Balay break; 7229371c9d4SSatish Balay case MATPRODUCT_ABt: 7239371c9d4SSatish Balay transA = false; 7249371c9d4SSatish Balay transB = true; 7259371c9d4SSatish Balay break; 726d71ae5a4SJacob Faibussowitsch default: 727d71ae5a4SJacob Faibussowitsch SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 728076ba34aSJunchao Zhang } 729076ba34aSJunchao Zhang 730a3f881fbSStefano Zampini A = product->A; 731a3f881fbSStefano Zampini B = product->B; 7329566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 7339566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 734a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 735a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 736a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr); 737076ba34aSJunchao Zhang 7385f80ce2aSJacob Faibussowitsch PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty"); 739076ba34aSJunchao Zhang 740076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 741076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 742076ba34aSJunchao Zhang 743076ba34aSJunchao Zhang /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 744076ba34aSJunchao Zhang if (transA) { 7459566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 746076ba34aSJunchao Zhang transA = false; 747a3f881fbSStefano Zampini } 748a3f881fbSStefano Zampini 749076ba34aSJunchao Zhang if (transB) { 7509566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 751076ba34aSJunchao Zhang transB = false; 752076ba34aSJunchao Zhang } 7539566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 7549566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, ckok->csrmat)); 755e944a159SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(3, 7, 99) 756866eb059SJunchao Zhang auto spgemmHandle = pdata->kh.get_spgemm_handle(); 757866eb059SJunchao Zhang if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */ 758e944a159SJunchao Zhang #endif 759866eb059SJunchao Zhang 7609566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 7619566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(C)); 762a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 763a3f881fbSStefano Zampini c = (Mat_SeqAIJ *)C->data; 7649566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz)); 7659566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n")); 7669566063dSJacob Faibussowitsch PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax)); 767a3f881fbSStefano Zampini c->reallocs = 0; 768076ba34aSJunchao Zhang C->info.mallocs = 0; 769a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 770a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 771a3f881fbSStefano Zampini C->num_ass++; 7723ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 773a3f881fbSStefano Zampini } 774a3f881fbSStefano Zampini 775d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 776d71ae5a4SJacob Faibussowitsch { 777076ba34aSJunchao Zhang Mat_Product *product = C->product; 778076ba34aSJunchao Zhang MatProductType ptype; 779076ba34aSJunchao Zhang Mat A, B; 780076ba34aSJunchao Zhang bool transA, transB; 781076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok, *bkok, *ckok; 782076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 783076ba34aSJunchao Zhang MPI_Comm comm; 784076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA, *csrmatB, csrmatC; 785a3f881fbSStefano Zampini 786a3f881fbSStefano Zampini PetscFunctionBegin; 787a3f881fbSStefano Zampini MatCheckProduct(C, 1); 7889566063dSJacob Faibussowitsch PetscCall(PetscObjectGetComm((PetscObject)C, &comm)); 7895f80ce2aSJacob Faibussowitsch PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty"); 790a3f881fbSStefano Zampini A = product->A; 791a3f881fbSStefano Zampini B = product->B; 7929566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 7939566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); 794a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 795a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 796076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 797076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 798076ba34aSJunchao Zhang 799a3f881fbSStefano Zampini ptype = product->type; 800a3f881fbSStefano Zampini switch (ptype) { 8019371c9d4SSatish Balay case MATPRODUCT_AB: 8029371c9d4SSatish Balay transA = false; 8039371c9d4SSatish Balay transB = false; 8049371c9d4SSatish Balay break; 8059371c9d4SSatish Balay case MATPRODUCT_AtB: 8069371c9d4SSatish Balay transA = true; 8079371c9d4SSatish Balay transB = false; 8089371c9d4SSatish Balay break; 8099371c9d4SSatish Balay case MATPRODUCT_ABt: 8109371c9d4SSatish Balay transA = false; 8119371c9d4SSatish Balay transB = true; 8129371c9d4SSatish Balay break; 813d71ae5a4SJacob Faibussowitsch default: 814d71ae5a4SJacob Faibussowitsch SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]); 815a3f881fbSStefano Zampini } 816a3f881fbSStefano Zampini 817076ba34aSJunchao Zhang product->data = pdata = new MatProductData_SeqAIJKokkos(); 818076ba34aSJunchao Zhang pdata->kh.set_team_work_size(16); 819076ba34aSJunchao Zhang pdata->kh.set_dynamic_scheduling(true); 820076ba34aSJunchao Zhang pdata->reusesym = product->api_user; 821a3f881fbSStefano Zampini 822076ba34aSJunchao Zhang /* TODO: add command line options to select spgemm algorithms */ 823866eb059SJunchao Zhang auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */ 824866eb059SJunchao Zhang 825866eb059SJunchao Zhang /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */ 826866eb059SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 827866eb059SJunchao Zhang #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0) 828866eb059SJunchao Zhang spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 829866eb059SJunchao Zhang #endif 830866eb059SJunchao Zhang #endif 831866eb059SJunchao Zhang 832076ba34aSJunchao Zhang pdata->kh.create_spgemm_handle(spgemm_alg); 833076ba34aSJunchao Zhang 8349566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 835076ba34aSJunchao Zhang /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 836076ba34aSJunchao Zhang if (transA) { 8379566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA)); 838076ba34aSJunchao Zhang transA = false; 839076ba34aSJunchao Zhang } 840076ba34aSJunchao Zhang 841076ba34aSJunchao Zhang if (transB) { 8429566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB)); 843076ba34aSJunchao Zhang transB = false; 844076ba34aSJunchao Zhang } 845076ba34aSJunchao Zhang 8469566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC)); 847866eb059SJunchao Zhang 848076ba34aSJunchao Zhang /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 849076ba34aSJunchao Zhang So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 850076ba34aSJunchao Zhang calling new Mat_SeqAIJKokkos(). 851076ba34aSJunchao Zhang TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 852076ba34aSJunchao Zhang */ 8539566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, *csrmatA, transA, *csrmatB, transB, csrmatC)); 854e944a159SJunchao Zhang #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(3, 7, 99) 855866eb059SJunchao Zhang /* Query if KK outputs a sorted matrix. If not, we need to sort it */ 856866eb059SJunchao Zhang auto spgemmHandle = pdata->kh.get_spgemm_handle(); 857866eb059SJunchao Zhang if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/ 858e944a159SJunchao Zhang #endif 8599566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 860076ba34aSJunchao Zhang 8619566063dSJacob Faibussowitsch PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 8629566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok)); 863076ba34aSJunchao Zhang C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 8643ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 865a3f881fbSStefano Zampini } 866a3f881fbSStefano Zampini 867a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 868d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 869d71ae5a4SJacob Faibussowitsch { 870076ba34aSJunchao Zhang Mat_Product *product = mat->product; 871a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE; 872a3f881fbSStefano Zampini 873a3f881fbSStefano Zampini PetscFunctionBegin; 874a3f881fbSStefano Zampini MatCheckProduct(mat, 1); 8759566063dSJacob Faibussowitsch PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok)); 87648a46eb9SPierre Jolivet if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok)); 877a3f881fbSStefano Zampini if (Biskok && Ciskok) { 878a3f881fbSStefano Zampini switch (product->type) { 879a3f881fbSStefano Zampini case MATPRODUCT_AB: 880a3f881fbSStefano Zampini case MATPRODUCT_AtB: 881d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABt: 882d71ae5a4SJacob Faibussowitsch mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 883d71ae5a4SJacob Faibussowitsch break; 884a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 885a3f881fbSStefano Zampini case MATPRODUCT_RARt: 886d71ae5a4SJacob Faibussowitsch case MATPRODUCT_ABC: 887d71ae5a4SJacob Faibussowitsch mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 888d71ae5a4SJacob Faibussowitsch break; 889d71ae5a4SJacob Faibussowitsch default: 890d71ae5a4SJacob Faibussowitsch break; 891a3f881fbSStefano Zampini } 892a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 8939566063dSJacob Faibussowitsch PetscCall(MatProductSetFromOptions_SeqAIJ(mat)); 894a3f881fbSStefano Zampini } 8953ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 896a3f881fbSStefano Zampini } 897a587d139SMark 898d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 899d71ae5a4SJacob Faibussowitsch { 900f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 901f0cf5187SStefano Zampini 902f0cf5187SStefano Zampini PetscFunctionBegin; 9039566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 9049566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 905f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 906076ba34aSJunchao Zhang KokkosBlas::scal(aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device()); 9079566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 9089566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0))); 9099566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 9103ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 911f0cf5187SStefano Zampini } 912f0cf5187SStefano Zampini 913d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 914d71ae5a4SJacob Faibussowitsch { 915076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 916a587d139SMark 917a587d139SMark PetscFunctionBegin; 918076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 9192328674fSJunchao Zhang if (aijkok) { /* Only zero the device if data is already there */ 920076ba34aSJunchao Zhang KokkosBlas::fill(aijkok->a_dual.view_device(), 0.0); 9219566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 9222328674fSJunchao Zhang } else { /* Might be preallocated but not assembled */ 9239566063dSJacob Faibussowitsch PetscCall(MatZeroEntries_SeqAIJ(A)); 9242328674fSJunchao Zhang } 9253ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 926a587d139SMark } 927a587d139SMark 928d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x) 929d71ae5a4SJacob Faibussowitsch { 930f78ce678SMark Adams Mat_SeqAIJ *aijseq; 931f78ce678SMark Adams Mat_SeqAIJKokkos *aijkok; 932f78ce678SMark Adams PetscInt n; 933f78ce678SMark Adams PetscScalarKokkosView xv; 934f78ce678SMark Adams 935f78ce678SMark Adams PetscFunctionBegin; 936f78ce678SMark Adams PetscCall(VecGetLocalSize(x, &n)); 937f78ce678SMark Adams PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector"); 938f78ce678SMark Adams PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices"); 939f78ce678SMark Adams 940f78ce678SMark Adams PetscCall(MatSeqAIJKokkosSyncDevice(A)); 941f78ce678SMark Adams aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 942f78ce678SMark Adams 943f78ce678SMark Adams if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { /* Set the diagonal pointer if not already */ 944f78ce678SMark Adams PetscCall(MatMarkDiagonal_SeqAIJ(A)); 945f78ce678SMark Adams aijseq = static_cast<Mat_SeqAIJ *>(A->data); 946f78ce678SMark Adams aijkok->SetDiagonal(aijseq->diag); 947f78ce678SMark Adams } 948f78ce678SMark Adams 949f78ce678SMark Adams const auto &Aa = aijkok->a_dual.view_device(); 950f78ce678SMark Adams const auto &Ai = aijkok->i_dual.view_device(); 951f78ce678SMark Adams const auto &Adiag = aijkok->diag_dual.view_device(); 952f78ce678SMark Adams 953f78ce678SMark Adams PetscCall(VecGetKokkosViewWrite(x, &xv)); 9549371c9d4SSatish Balay Kokkos::parallel_for( 9559371c9d4SSatish Balay n, KOKKOS_LAMBDA(const PetscInt i) { 956f78ce678SMark Adams if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i)); 957f78ce678SMark Adams else xv(i) = 0; 958f78ce678SMark Adams }); 959f78ce678SMark Adams PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 9603ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 961f78ce678SMark Adams } 962f78ce678SMark Adams 963db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 964d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv) 965d71ae5a4SJacob Faibussowitsch { 966db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 967db78de30SJunchao Zhang 968db78de30SJunchao Zhang PetscFunctionBegin; 969db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 970db78de30SJunchao Zhang PetscValidPointer(kv, 2); 971db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 9729566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 973db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 974076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 9753ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 976db78de30SJunchao Zhang } 977db78de30SJunchao Zhang 978d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv) 979d71ae5a4SJacob Faibussowitsch { 980db78de30SJunchao Zhang PetscFunctionBegin; 981db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 982db78de30SJunchao Zhang PetscValidPointer(kv, 2); 983db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 9843ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 985db78de30SJunchao Zhang } 986db78de30SJunchao Zhang 987d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv) 988d71ae5a4SJacob Faibussowitsch { 989db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 990db78de30SJunchao Zhang 991db78de30SJunchao Zhang PetscFunctionBegin; 992db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 993db78de30SJunchao Zhang PetscValidPointer(kv, 2); 994db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 9959566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 996db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 997076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 9983ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 999db78de30SJunchao Zhang } 1000db78de30SJunchao Zhang 1001d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv) 1002d71ae5a4SJacob Faibussowitsch { 1003db78de30SJunchao Zhang PetscFunctionBegin; 1004db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1005db78de30SJunchao Zhang PetscValidPointer(kv, 2); 1006db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 10079566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 10083ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1009db78de30SJunchao Zhang } 1010db78de30SJunchao Zhang 1011d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1012d71ae5a4SJacob Faibussowitsch { 1013db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 1014db78de30SJunchao Zhang 1015db78de30SJunchao Zhang PetscFunctionBegin; 1016db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1017db78de30SJunchao Zhang PetscValidPointer(kv, 2); 1018db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 1019db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1020076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 10213ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1022db78de30SJunchao Zhang } 1023db78de30SJunchao Zhang 1024d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv) 1025d71ae5a4SJacob Faibussowitsch { 1026db78de30SJunchao Zhang PetscFunctionBegin; 1027db78de30SJunchao Zhang PetscValidHeaderSpecific(A, MAT_CLASSID, 1); 1028db78de30SJunchao Zhang PetscValidPointer(kv, 2); 1029db78de30SJunchao Zhang PetscCheckTypeName(A, MATSEQAIJKOKKOS); 10309566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(A)); 10313ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1032db78de30SJunchao Zhang } 1033db78de30SJunchao Zhang 1034c17cf699SJunchao Zhang /* Computes Y += alpha X */ 1035d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern) 1036d71ae5a4SJacob Faibussowitsch { 1037a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data; 1038c17cf699SJunchao Zhang Mat_SeqAIJKokkos *xkok, *ykok, *zkok; 1039c17cf699SJunchao Zhang ConstMatScalarKokkosView Xa; 1040c17cf699SJunchao Zhang MatScalarKokkosView Ya; 1041a587d139SMark 1042a587d139SMark PetscFunctionBegin; 1043c17cf699SJunchao Zhang PetscCheckTypeName(Y, MATSEQAIJKOKKOS); 1044c17cf699SJunchao Zhang PetscCheckTypeName(X, MATSEQAIJKOKKOS); 10459566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(Y)); 10469566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(X)); 10479566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 1048db78de30SJunchao Zhang 1049c17cf699SJunchao Zhang if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 1050c17cf699SJunchao Zhang /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 1051a587d139SMark PetscBool e; 10529566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e)); 1053a587d139SMark if (e) { 10549566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e)); 1055c17cf699SJunchao Zhang if (e) pattern = SAME_NONZERO_PATTERN; 1056a587d139SMark } 1057a587d139SMark } 1058db78de30SJunchao Zhang 1059c17cf699SJunchao Zhang /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 1060c17cf699SJunchao Zhang cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 1061c17cf699SJunchao Zhang If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 1062c17cf699SJunchao Zhang KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 1063c17cf699SJunchao Zhang */ 1064c17cf699SJunchao Zhang ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr); 1065c17cf699SJunchao Zhang xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr); 1066c17cf699SJunchao Zhang Xa = xkok->a_dual.view_device(); 1067c17cf699SJunchao Zhang Ya = ykok->a_dual.view_device(); 1068c17cf699SJunchao Zhang 1069c17cf699SJunchao Zhang if (pattern == SAME_NONZERO_PATTERN) { 1070c17cf699SJunchao Zhang KokkosBlas::axpy(alpha, Xa, Ya); 10719566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1072c17cf699SJunchao Zhang } else if (pattern == SUBSET_NONZERO_PATTERN) { 1073c17cf699SJunchao Zhang MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device(); 1074c17cf699SJunchao Zhang MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device(); 1075c17cf699SJunchao Zhang 10769371c9d4SSatish Balay Kokkos::parallel_for( 10779371c9d4SSatish Balay Kokkos::TeamPolicy<>(Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) { 1078c17cf699SJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 1079c17cf699SJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { /* Only one thread works in a team */ 1080c17cf699SJunchao Zhang PetscInt p, q = Yi(i); 1081c17cf699SJunchao Zhang for (p = Xi(i); p < Xi(i + 1); p++) { /* For each nonzero on row i of X */ 1082c17cf699SJunchao Zhang while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; /* find the matching nonzero on row i of Y */ 1083c17cf699SJunchao Zhang if (Xj(p) == Yj(q)) { /* Found it */ 1084c17cf699SJunchao Zhang Ya(q) += alpha * Xa(p); 1085c17cf699SJunchao Zhang q++; 1086a587d139SMark } else { 1087c17cf699SJunchao Zhang /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1088c17cf699SJunchao Zhang Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1089c17cf699SJunchao Zhang */ 10909371c9d4SSatish Balay if (Yi(i) != Yi(i + 1)) 10919371c9d4SSatish Balay Ya(Yi(i)) = 10928b8b16f9SJunchao Zhang #if PETSC_PKG_KOKKOS_VERSION_GE(3, 6, 99) 10938d80707aSStefano Zampini Kokkos::ArithTraits<PetscScalar>::nan(); 10948b8b16f9SJunchao Zhang #else 10958b8b16f9SJunchao Zhang Kokkos::Experimental::nan("1"); 10968b8b16f9SJunchao Zhang #endif 1097a587d139SMark } 1098c17cf699SJunchao Zhang } 1099c17cf699SJunchao Zhang }); 1100c17cf699SJunchao Zhang }); 11019566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosModifyDevice(Y)); 1102c17cf699SJunchao Zhang } else { /* different nonzero patterns */ 1103c17cf699SJunchao Zhang Mat Z; 1104c17cf699SJunchao Zhang KokkosCsrMatrix zcsr; 1105c17cf699SJunchao Zhang KernelHandle kh; 1106c17cf699SJunchao Zhang kh.create_spadd_handle(false); 1107c17cf699SJunchao Zhang KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr); 1108c17cf699SJunchao Zhang KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr); 1109c17cf699SJunchao Zhang zkok = new Mat_SeqAIJKokkos(zcsr); 11109566063dSJacob Faibussowitsch PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z)); 11119566063dSJacob Faibussowitsch PetscCall(MatHeaderReplace(Y, &Z)); 1112c17cf699SJunchao Zhang kh.destroy_spadd_handle(); 1113c17cf699SJunchao Zhang } 11149566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 11159566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); /* Because we scaled X and then added it to Y */ 11163ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1117a587d139SMark } 1118a587d139SMark 1119d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[]) 1120d71ae5a4SJacob Faibussowitsch { 112142550becSJunchao Zhang Mat_SeqAIJKokkos *akok; 112242550becSJunchao Zhang Mat_SeqAIJ *aseq; 112342550becSJunchao Zhang 112442550becSJunchao Zhang PetscFunctionBegin; 11259566063dSJacob Faibussowitsch PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j)); 1126394ed5ebSJunchao Zhang aseq = static_cast<Mat_SeqAIJ *>(mat->data); 112742550becSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr); 1128cbc6b225SStefano Zampini delete akok; 1129cbc6b225SStefano Zampini mat->spptr = akok = new Mat_SeqAIJKokkos(mat->rmap->n, mat->cmap->n, aseq->nz, aseq->i, aseq->j, aseq->a, mat->nonzerostate + 1, PETSC_FALSE); 11309566063dSJacob Faibussowitsch PetscCall(MatZeroEntries_SeqAIJKokkos(mat)); 1131394ed5ebSJunchao Zhang akok->SetUpCOO(aseq); 11323ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 113342550becSJunchao Zhang } 113442550becSJunchao Zhang 1135d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode) 1136d71ae5a4SJacob Faibussowitsch { 113742550becSJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data); 113842550becSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 1139394ed5ebSJunchao Zhang PetscCount Annz = aseq->nz; 1140394ed5ebSJunchao Zhang const PetscCountKokkosView &jmap = akok->jmap_d; 1141394ed5ebSJunchao Zhang const PetscCountKokkosView &perm = akok->perm_d; 114242550becSJunchao Zhang MatScalarKokkosView Aa; 114342550becSJunchao Zhang ConstMatScalarKokkosView kv; 114442550becSJunchao Zhang PetscMemType memtype; 114542550becSJunchao Zhang 114642550becSJunchao Zhang PetscFunctionBegin; 11479566063dSJacob Faibussowitsch PetscCall(PetscGetMemType(v, &memtype)); 114842550becSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 1149394ed5ebSJunchao Zhang kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, aseq->coo_n)); 115042550becSJunchao Zhang } else { 1151394ed5ebSJunchao Zhang kv = ConstMatScalarKokkosView(v, aseq->coo_n); /* Directly use v[]'s memory */ 115242550becSJunchao Zhang } 115342550becSJunchao Zhang 1154c7b718f4SJunchao Zhang if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */ 1155c7b718f4SJunchao Zhang else PetscCall(MatSeqAIJGetKokkosView(A, &Aa)); /* read & write matrix values */ 115642550becSJunchao Zhang 11579371c9d4SSatish Balay Kokkos::parallel_for( 11589371c9d4SSatish Balay Annz, KOKKOS_LAMBDA(const PetscCount i) { 1159c7b718f4SJunchao Zhang PetscScalar sum = 0.0; 1160c7b718f4SJunchao Zhang for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k)); 1161c7b718f4SJunchao Zhang Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum; 1162c7b718f4SJunchao Zhang }); 1163394ed5ebSJunchao Zhang 11649566063dSJacob Faibussowitsch if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 11659566063dSJacob Faibussowitsch else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa)); 11663ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 116742550becSJunchao Zhang } 116842550becSJunchao Zhang 1169d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSeqAIJMoveDiagonalValuesFront_SeqAIJKokkos(Mat A, const PetscInt *diag) 1170d71ae5a4SJacob Faibussowitsch { 11715fbaff96SJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr); 11725fbaff96SJunchao Zhang MatScalarKokkosView Aa; 11735fbaff96SJunchao Zhang const MatRowMapKokkosView &Ai = akok->i_dual.view_device(); 11745fbaff96SJunchao Zhang PetscInt m = A->rmap->n; 11755fbaff96SJunchao Zhang ConstMatRowMapKokkosView Adiag(diag, m); /* diag is a device pointer */ 11765fbaff96SJunchao Zhang 11775fbaff96SJunchao Zhang PetscFunctionBegin; 11785fbaff96SJunchao Zhang PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); 11799371c9d4SSatish Balay Kokkos::parallel_for( 11809371c9d4SSatish Balay m, KOKKOS_LAMBDA(const PetscInt i) { 11815fbaff96SJunchao Zhang PetscScalar tmp; 11825fbaff96SJunchao Zhang if (Adiag(i) >= Ai(i) && Adiag(i) < Ai(i + 1)) { /* The diagonal element exists */ 11835fbaff96SJunchao Zhang tmp = Aa(Ai(i)); 11845fbaff96SJunchao Zhang Aa(Ai(i)) = Aa(Adiag(i)); 11855fbaff96SJunchao Zhang Aa(Adiag(i)) = tmp; 11865fbaff96SJunchao Zhang } 11875fbaff96SJunchao Zhang }); 11885fbaff96SJunchao Zhang PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa)); 11893ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 11905fbaff96SJunchao Zhang } 11915fbaff96SJunchao Zhang 1192d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1193d71ae5a4SJacob Faibussowitsch { 11948f7e8f9dSMark Adams PetscFunctionBegin; 11959566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncHost(A)); 11969566063dSJacob Faibussowitsch PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info)); 11978f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 11983ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 11998f7e8f9dSMark Adams } 12008f7e8f9dSMark Adams 1201d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 1202d71ae5a4SJacob Faibussowitsch { 1203076ba34aSJunchao Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1204076ba34aSJunchao Zhang 12058c3ff71bSJunchao Zhang PetscFunctionBegin; 1206076ba34aSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 12076f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 12086f3d89d0SStefano Zampini 12098c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 12108c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 12118c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1212a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 1213f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 1214a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1215076ba34aSJunchao Zhang A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 12168c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 12178c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 12188c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 12198c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 12208c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 12218c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1222076ba34aSJunchao Zhang A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 12230ecb592aSJunchao Zhang A->ops->transpose = MatTranspose_SeqAIJKokkos; 1224152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 1225f78ce678SMark Adams A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos; 1226076ba34aSJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1227076ba34aSJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1228076ba34aSJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1229076ba34aSJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1230076ba34aSJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1231076ba34aSJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 12327ee59b9bSJunchao Zhang a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos; 123342550becSJunchao Zhang 12349566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos)); 12359566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos)); 12363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1237076ba34aSJunchao Zhang } 1238076ba34aSJunchao Zhang 1239d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok) 1240d71ae5a4SJacob Faibussowitsch { 1241076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 1242076ba34aSJunchao Zhang PetscInt i, m, n; 1243076ba34aSJunchao Zhang 1244076ba34aSJunchao Zhang PetscFunctionBegin; 12455f80ce2aSJacob Faibussowitsch PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty"); 1246076ba34aSJunchao Zhang 1247076ba34aSJunchao Zhang m = akok->nrows(); 1248076ba34aSJunchao Zhang n = akok->ncols(); 12499566063dSJacob Faibussowitsch PetscCall(MatSetSizes(A, m, n, m, n)); 12509566063dSJacob Faibussowitsch PetscCall(MatSetType(A, MATSEQAIJKOKKOS)); 1251076ba34aSJunchao Zhang 1252076ba34aSJunchao Zhang /* Set up data structures of A as a MATSEQAIJ */ 12539566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL)); 1254076ba34aSJunchao Zhang aseq = (Mat_SeqAIJ *)(A)->data; 1255076ba34aSJunchao Zhang 1256076ba34aSJunchao Zhang akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1257076ba34aSJunchao Zhang akok->j_dual.sync_host(); 1258076ba34aSJunchao Zhang 1259076ba34aSJunchao Zhang aseq->i = akok->i_host_data(); 1260076ba34aSJunchao Zhang aseq->j = akok->j_host_data(); 1261076ba34aSJunchao Zhang aseq->a = akok->a_host_data(); 1262076ba34aSJunchao Zhang aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1263076ba34aSJunchao Zhang aseq->singlemalloc = PETSC_FALSE; 1264076ba34aSJunchao Zhang aseq->free_a = PETSC_FALSE; 1265076ba34aSJunchao Zhang aseq->free_ij = PETSC_FALSE; 1266076ba34aSJunchao Zhang aseq->nz = akok->nnz(); 1267076ba34aSJunchao Zhang aseq->maxnz = aseq->nz; 1268076ba34aSJunchao Zhang 12699566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aseq->imax)); 12709566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(m, &aseq->ilen)); 1271ad540459SPierre Jolivet for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i]; 1272076ba34aSJunchao Zhang 1273076ba34aSJunchao Zhang /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1274076ba34aSJunchao Zhang akok->nonzerostate = A->nonzerostate; 1275ff751488SJunchao Zhang A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */ 12769566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY)); 12779566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY)); 12783ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1279076ba34aSJunchao Zhang } 1280076ba34aSJunchao Zhang 1281076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1282076ba34aSJunchao Zhang 1283076ba34aSJunchao Zhang Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1284076ba34aSJunchao Zhang */ 1285d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A) 1286d71ae5a4SJacob Faibussowitsch { 1287076ba34aSJunchao Zhang PetscFunctionBegin; 12889566063dSJacob Faibussowitsch PetscCall(MatCreate(comm, A)); 12899566063dSJacob Faibussowitsch PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok)); 12903ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 12918c3ff71bSJunchao Zhang } 12928c3ff71bSJunchao Zhang 1293152b3e56SJunchao Zhang /*@C 129411a5261eSBarry Smith MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format 12958c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 12968c3ff71bSJunchao Zhang Kokkos for calculations. For good matrix 12978c3ff71bSJunchao Zhang assembly performance the user should preallocate the matrix storage by setting 12988c3ff71bSJunchao Zhang the parameter nz (or the array nnz). By setting these parameters accurately, 12998c3ff71bSJunchao Zhang performance during matrix assembly can be increased by more than a factor of 50. 13008c3ff71bSJunchao Zhang 13018c3ff71bSJunchao Zhang Collective 13028c3ff71bSJunchao Zhang 13038c3ff71bSJunchao Zhang Input Parameters: 130411a5261eSBarry Smith + comm - MPI communicator, set to `PETSC_COMM_SELF` 13058c3ff71bSJunchao Zhang . m - number of rows 13068c3ff71bSJunchao Zhang . n - number of columns 13078c3ff71bSJunchao Zhang . nz - number of nonzeros per row (same for all rows) 13088c3ff71bSJunchao Zhang - nnz - array containing the number of nonzeros in the various rows 1309*2ef1f0ffSBarry Smith (possibly different for each row) or `NULL` 13108c3ff71bSJunchao Zhang 13118c3ff71bSJunchao Zhang Output Parameter: 13128c3ff71bSJunchao Zhang . A - the matrix 13138c3ff71bSJunchao Zhang 1314*2ef1f0ffSBarry Smith Level: intermediate 1315*2ef1f0ffSBarry Smith 1316*2ef1f0ffSBarry Smith Notes: 131711a5261eSBarry Smith It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`, 13188c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 131911a5261eSBarry Smith [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`] 13208c3ff71bSJunchao Zhang 1321*2ef1f0ffSBarry Smith If `nnz` is given then `nz` is ignored 13228c3ff71bSJunchao Zhang 132311a5261eSBarry Smith The AIJ format, also called 1324*2ef1f0ffSBarry Smith compressed row storage, is fully compatible with standard Fortran 13258c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 13268c3ff71bSJunchao Zhang either one (as in Fortran) or zero. See the users' manual for details. 13278c3ff71bSJunchao Zhang 1328*2ef1f0ffSBarry Smith Specify the preallocated storage with either `nz` or `nnz` (not both). 1329*2ef1f0ffSBarry Smith Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory 1330*2ef1f0ffSBarry Smith allocation. 13318c3ff71bSJunchao Zhang 13328c3ff71bSJunchao Zhang By default, this format uses inodes (identical nodes) when possible, to 13338c3ff71bSJunchao Zhang improve numerical efficiency of matrix-vector products and solves. We 13348c3ff71bSJunchao Zhang search for consecutive rows with the same nonzero structure, thereby 13358c3ff71bSJunchao Zhang reusing matrix information to achieve increased efficiency. 13368c3ff71bSJunchao Zhang 1337*2ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MatCreateAIJ()` 13388c3ff71bSJunchao Zhang @*/ 1339d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A) 1340d71ae5a4SJacob Faibussowitsch { 13418c3ff71bSJunchao Zhang PetscFunctionBegin; 13429566063dSJacob Faibussowitsch PetscCall(PetscKokkosInitializeCheck()); 13439566063dSJacob Faibussowitsch PetscCall(MatCreate(comm, A)); 13449566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*A, m, n, m, n)); 13459566063dSJacob Faibussowitsch PetscCall(MatSetType(*A, MATSEQAIJKOKKOS)); 13469566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz)); 13473ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 13488c3ff71bSJunchao Zhang } 1349930e68a5SMark Adams 13508f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member; 13518f7e8f9dSMark Adams // 135246804e07SMark Adams // This factorization exploits block diagonal matrices with "Nf" (not used). 13538f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 13548f7e8f9dSMark Adams // 1355d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B, Mat A, const MatFactorInfo *info) 1356d71ae5a4SJacob Faibussowitsch { 13578f7e8f9dSMark Adams Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 13588f7e8f9dSMark Adams Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 13598f7e8f9dSMark Adams IS isrow = b->row, isicol = b->icol; 13608f7e8f9dSMark Adams const PetscInt *r_h, *ic_h; 1361300d22a6SJunchao Zhang const PetscInt n = A->rmap->n, *ai_d = aijkok->i_dual.view_device().data(), *aj_d = aijkok->j_dual.view_device().data(), *bi_d = baijkok->i_dual.view_device().data(), *bj_d = baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d.data(); 1362076ba34aSJunchao Zhang const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1363076ba34aSJunchao Zhang PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 13648f7e8f9dSMark Adams PetscBool row_identity, col_identity; 136546804e07SMark Adams PetscInt nc, Nf = 1, nVec = 32; // should be a parameter, Nf is batch size - not used 1366930e68a5SMark Adams 1367930e68a5SMark Adams PetscFunctionBegin; 13682c71b3e2SJacob Faibussowitsch PetscCheck(A->rmap->n == n, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT, A->rmap->n, n); 1369b94d7dedSBarry Smith PetscCall(MatIsStructurallySymmetric(A, &row_identity)); 13702c71b3e2SJacob Faibussowitsch PetscCheck(row_identity, PetscObjectComm((PetscObject)A), PETSC_ERR_SUP, "structurally symmetric matrices only supported"); 13719566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &r_h)); 13729566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isicol, &ic_h)); 13739566063dSJacob Faibussowitsch PetscCall(ISGetSize(isicol, &nc)); 13749566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 13759566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 13768f7e8f9dSMark Adams { 13778f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1 13788f7e8f9dSMark Adams using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 13798f7e8f9dSMark Adams using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 13808f7e8f9dSMark Adams using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 13818f7e8f9dSMark Adams const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_r_k(r_h, n); 13828f7e8f9dSMark Adams Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_r_k("r", n); 13838f7e8f9dSMark Adams const Kokkos::View<const PetscInt *, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_ic_k(ic_h, nc); 13848f7e8f9dSMark Adams Kokkos::View<PetscInt *, Kokkos::LayoutLeft> d_ic_k("ic", nc); 13858f7e8f9dSMark Adams size_t flops_h = 0.0; 13868f7e8f9dSMark Adams Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k(&flops_h); 13878f7e8f9dSMark Adams Kokkos::View<size_t> d_flops_k("flops"); 13888f7e8f9dSMark Adams const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 1389da81f932SPierre Jolivet const int nloc = n / Nf, Ni = (conc > 8) ? 1 /* some intelligent number of SMs -- but need league_barrier */ : 1; 13908f7e8f9dSMark Adams Kokkos::deep_copy(d_flops_k, h_flops_k); 13918f7e8f9dSMark Adams Kokkos::deep_copy(d_r_k, h_r_k); 13928f7e8f9dSMark Adams Kokkos::deep_copy(d_ic_k, h_ic_k); 13938f7e8f9dSMark Adams // Fill A --> fact 13949371c9d4SSatish Balay Kokkos::parallel_for( 13959371c9d4SSatish Balay Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec), KOKKOS_LAMBDA(const team_member team) { 1396042217e8SBarry Smith const PetscInt field = team.league_rank() / Ni, field_block = team.league_rank() % Ni; // use grid.x/y in CUDA 13978f7e8f9dSMark Adams const PetscInt nloc_i = (nloc / Ni + !!(nloc % Ni)), start_i = field * nloc + field_block * nloc_i, end_i = (start_i + nloc_i) > (field + 1) * nloc ? (field + 1) * nloc : (start_i + nloc_i); 13988f7e8f9dSMark Adams const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 13998f7e8f9dSMark Adams // zero rows of B 14008f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) { 14018f7e8f9dSMark Adams PetscInt nzbL = bi_d[rowb + 1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb + 1]; // with diag 14028f7e8f9dSMark Adams PetscScalar *baL = ba_d + bi_d[rowb]; 14038f7e8f9dSMark Adams PetscScalar *baU = ba_d + bdiag_d[rowb + 1] + 1; 14048f7e8f9dSMark Adams /* zero (unfactored row) */ 14058f7e8f9dSMark Adams for (int j = 0; j < nzbL; j++) baL[j] = 0; 14068f7e8f9dSMark Adams for (int j = 0; j < nzbU; j++) baU[j] = 0; 14078f7e8f9dSMark Adams }); 14088f7e8f9dSMark Adams // copy A into B 14098f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=](const int &rowb) { 14108f7e8f9dSMark Adams PetscInt rowa = r[rowb], nza = ai_d[rowa + 1] - ai_d[rowa]; 14118f7e8f9dSMark Adams const PetscScalar *av = aa_d + ai_d[rowa]; 14128f7e8f9dSMark Adams const PetscInt *ajtmp = aj_d + ai_d[rowa]; 14138f7e8f9dSMark Adams /* load in initial (unfactored row) */ 14148f7e8f9dSMark Adams for (int j = 0; j < nza; j++) { 14158f7e8f9dSMark Adams PetscInt colb = ic[ajtmp[j]]; 14168f7e8f9dSMark Adams PetscScalar vala = av[j]; 14178f7e8f9dSMark Adams if (colb == rowb) { 14188f7e8f9dSMark Adams *(ba_d + bdiag_d[rowb]) = vala; 14198f7e8f9dSMark Adams } else { 14208f7e8f9dSMark Adams const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]); 14218f7e8f9dSMark Adams PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb + 1] + 1 : bi_d[rowb]); 14228f7e8f9dSMark Adams PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb + 1] + 1) : bi_d[rowb + 1] - bi_d[rowb], set = 0; 14238f7e8f9dSMark Adams for (int j = 0; j < nz; j++) { 14248f7e8f9dSMark Adams if (pbj[j] == colb) { 14258f7e8f9dSMark Adams pba[j] = vala; 14268f7e8f9dSMark Adams set++; 14278f7e8f9dSMark Adams break; 14288f7e8f9dSMark Adams } 14298f7e8f9dSMark Adams } 14308f1da0b2SJunchao Zhang #if !defined(PETSC_HAVE_SYCL) 14318f7e8f9dSMark Adams if (set != 1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 14328f1da0b2SJunchao Zhang #endif 14338f7e8f9dSMark Adams } 14348f7e8f9dSMark Adams } 14358f7e8f9dSMark Adams }); 14368f7e8f9dSMark Adams }); 14378f7e8f9dSMark Adams Kokkos::fence(); 1438930e68a5SMark Adams 14399371c9d4SSatish Balay Kokkos::parallel_for( 14409371c9d4SSatish Balay Kokkos::TeamPolicy<>(Nf * Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size() + scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA(const team_member team) { 14418f7e8f9dSMark Adams sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 14428f7e8f9dSMark Adams scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 14438f7e8f9dSMark Adams sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1444042217e8SBarry Smith const PetscInt field = team.league_rank() / Ni, field_block_idx = team.league_rank() % Ni; // use grid.x/y in CUDA 14458f7e8f9dSMark Adams const PetscInt start = field * nloc, end = start + nloc; 14468f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 14478f7e8f9dSMark Adams // A22 panel update for each row A(1,:) and col A(:,1) 14488f7e8f9dSMark Adams for (int ii = start; ii < end - 1; ii++) { 14498f7e8f9dSMark Adams const PetscInt *bjUi = bj_d + bdiag_d[ii + 1] + 1, nzUi = bdiag_d[ii] - (bdiag_d[ii + 1] + 1); // vector, and vector size, of column indices of U(i,(i+1):end) 14508f7e8f9dSMark Adams const PetscScalar *baUi = ba_d + bdiag_d[ii + 1] + 1; // vector of data U(i,i+1:end) 14518f7e8f9dSMark Adams const PetscInt nUi_its = nzUi / Ni + !!(nzUi % Ni); 14528f7e8f9dSMark Adams const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 14538f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=](const int j) { 14548f7e8f9dSMark Adams PetscInt kIdx = j * Ni + field_block_idx; 14559371c9d4SSatish Balay if (kIdx >= nzUi) /* void */ 14569371c9d4SSatish Balay ; 14578f7e8f9dSMark Adams else { 14588f7e8f9dSMark Adams const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 14598f7e8f9dSMark Adams const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 14608f7e8f9dSMark Adams const PetscInt nzL = bi_d[myk + 1] - bi_d[myk]; // size of L_k(:) 14618f7e8f9dSMark Adams size_t st_idx; 14628f7e8f9dSMark Adams // find and do L(k,i) = A(:k,i) / A(i,i) 14638f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 14648f7e8f9dSMark Adams // get column, there has got to be a better way 14659371c9d4SSatish Balay Kokkos::parallel_reduce( 14669371c9d4SSatish Balay Kokkos::ThreadVectorRange(team, nzL), 14679371c9d4SSatish Balay [&](const int &j, size_t &idx) { 14688f7e8f9dSMark Adams if (pjL[j] == ii) { 14698f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + j; 14708f7e8f9dSMark Adams idx = j; // output 14718f7e8f9dSMark Adams *pLki = *pLki / Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 14728f7e8f9dSMark Adams } 14739371c9d4SSatish Balay }, 14749371c9d4SSatish Balay st_idx); 14759371c9d4SSatish Balay Kokkos::single(Kokkos::PerThread(team), [=]() { 14769371c9d4SSatish Balay colkIdx() = st_idx; 14779371c9d4SSatish Balay L_ki() = *(ba_d + bi_d[myk] + st_idx); 14789371c9d4SSatish Balay }); 14798f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 148099551766SMark Adams if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n", (int)myk, ii); // uses a register 148199551766SMark Adams #endif 148299551766SMark Adams // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 14838f7e8f9dSMark Adams // U(i+1,:end) 14848f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, nzUi), [=](const int &uiIdx) { // index into i (U) 14858f7e8f9dSMark Adams PetscScalar Uij = baUi[uiIdx]; 14868f7e8f9dSMark Adams PetscInt col = bjUi[uiIdx]; 14878f7e8f9dSMark Adams if (col == myk) { 14888f7e8f9dSMark Adams // A_kk = A_kk - L_ki * U_ij(k) 14898f7e8f9dSMark Adams PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 14908f7e8f9dSMark Adams *Akkv = *Akkv - L_ki() * Uij; // UiK 14918f7e8f9dSMark Adams } else { 14928f7e8f9dSMark Adams PetscScalar *start, *end, *pAkjv = NULL; 14938f7e8f9dSMark Adams PetscInt high, low; 14948f7e8f9dSMark Adams const PetscInt *startj; 14958f7e8f9dSMark Adams if (col < myk) { // L 14968f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 14978f7e8f9dSMark Adams PetscInt idx = (pLki + 1) - (ba_d + bi_d[myk]); // index into row 14988f7e8f9dSMark Adams start = pLki + 1; // start at pLki+1, A22(myk,1) 14998f7e8f9dSMark Adams startj = bj_d + bi_d[myk] + idx; 15008f7e8f9dSMark Adams end = ba_d + bi_d[myk + 1]; 15018f7e8f9dSMark Adams } else { 15028f7e8f9dSMark Adams PetscInt idx = bdiag_d[myk + 1] + 1; 15038f7e8f9dSMark Adams start = ba_d + idx; 15048f7e8f9dSMark Adams startj = bj_d + idx; 15058f7e8f9dSMark Adams end = ba_d + bdiag_d[myk]; 15068f7e8f9dSMark Adams } 15078f7e8f9dSMark Adams // search for 'col', use bisection search - TODO 15088f7e8f9dSMark Adams low = 0; 15098f7e8f9dSMark Adams high = (PetscInt)(end - start); 15108f7e8f9dSMark Adams while (high - low > 5) { 15118f7e8f9dSMark Adams int t = (low + high) / 2; 15128f7e8f9dSMark Adams if (startj[t] > col) high = t; 15138f7e8f9dSMark Adams else low = t; 15148f7e8f9dSMark Adams } 15158f7e8f9dSMark Adams for (pAkjv = start + low; pAkjv < start + high; pAkjv++) { 15168f7e8f9dSMark Adams if (startj[pAkjv - start] == col) break; 15178f7e8f9dSMark Adams } 15188f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 151999551766SMark Adams if (pAkjv == start + high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n", (int)myk, (int)col); // uses a register 152099551766SMark Adams #endif 15218f7e8f9dSMark Adams *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 15228f7e8f9dSMark Adams } 15238f7e8f9dSMark Adams }); 15248f7e8f9dSMark Adams } 15258f7e8f9dSMark Adams }); 15268f7e8f9dSMark Adams team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 15278f7e8f9dSMark Adams if (field_block_idx == 0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add(flops.data(), (size_t)(2 * (nzUi * nzUi) + 2)); }); 15288f7e8f9dSMark Adams } /* endof for (i=0; i<n; i++) { */ 15299371c9d4SSatish Balay Kokkos::single(Kokkos::PerTeam(team), [=]() { 15309371c9d4SSatish Balay Kokkos::atomic_add(&d_flops_k(), flops()); 15319371c9d4SSatish Balay flops() = 0; 15329371c9d4SSatish Balay }); 15338f7e8f9dSMark Adams }); 15348f7e8f9dSMark Adams Kokkos::fence(); 15358f7e8f9dSMark Adams Kokkos::deep_copy(h_flops_k, d_flops_k); 15369566063dSJacob Faibussowitsch PetscCall(PetscLogGpuFlops((PetscLogDouble)h_flops_k())); 15379371c9d4SSatish Balay Kokkos::parallel_for( 15389371c9d4SSatish Balay Kokkos::TeamPolicy<>(Nf * Ni, 1, 256), KOKKOS_LAMBDA(const team_member team) { 15398f7e8f9dSMark Adams const PetscInt lg_rank = team.league_rank(), field = lg_rank / Ni; //, field_offset = lg_rank%Ni; 15408f7e8f9dSMark Adams const PetscInt start = field * nloc, end = start + nloc, n_its = (nloc / Ni + !!(nloc % Ni)); // 1/Ni iters 15418f7e8f9dSMark Adams /* Invert diagonal for simpler triangular solves */ 15428f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=](int outer_index) { 15438f7e8f9dSMark Adams int i = start + outer_index * Ni + lg_rank % Ni; 15448f7e8f9dSMark Adams if (i < end) { 15458f7e8f9dSMark Adams PetscScalar *pv = ba_d + bdiag_d[i]; 15468f7e8f9dSMark Adams *pv = 1.0 / (*pv); 15478f7e8f9dSMark Adams } 15488f7e8f9dSMark Adams }); 15498f7e8f9dSMark Adams }); 15508f7e8f9dSMark Adams } 15519566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 15529566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isicol, &ic_h)); 15539566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &r_h)); 15548f7e8f9dSMark Adams 15559566063dSJacob Faibussowitsch PetscCall(ISIdentity(isrow, &row_identity)); 15569566063dSJacob Faibussowitsch PetscCall(ISIdentity(isicol, &col_identity)); 15578f7e8f9dSMark Adams if (b->inode.size) { 15588f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_Inode; 15598f7e8f9dSMark Adams } else if (row_identity && col_identity) { 15608f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 15618f7e8f9dSMark Adams } else { 15628f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 15638f7e8f9dSMark Adams } 15648f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_GPU; 15659566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncHost(B)); // solve on CPU 15668f7e8f9dSMark Adams B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 15678f7e8f9dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 15688f7e8f9dSMark Adams B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 15698f7e8f9dSMark Adams B->ops->matsolve = MatMatSolve_SeqAIJ; 15708f7e8f9dSMark Adams B->assembled = PETSC_TRUE; 15718f7e8f9dSMark Adams B->preallocated = PETSC_TRUE; 15728f7e8f9dSMark Adams 15733ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1574930e68a5SMark Adams } 1575930e68a5SMark Adams 1576d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1577d71ae5a4SJacob Faibussowitsch { 1578930e68a5SMark Adams PetscFunctionBegin; 15799566063dSJacob Faibussowitsch PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 158086a27549SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 15813ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 158286a27549SJunchao Zhang } 158386a27549SJunchao Zhang 1584d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 1585d71ae5a4SJacob Faibussowitsch { 158686a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 158786a27549SJunchao Zhang 158886a27549SJunchao Zhang PetscFunctionBegin; 158986a27549SJunchao Zhang if (!factors->sptrsv_symbolic_completed) { 159086a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d); 159186a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d); 159286a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_TRUE; 159386a27549SJunchao Zhang } 15943ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 159586a27549SJunchao Zhang } 159686a27549SJunchao Zhang 159786a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */ 1598d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 1599d71ae5a4SJacob Faibussowitsch { 160086a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 1601076ba34aSJunchao Zhang MatColIdxType n = A->rmap->n; 160286a27549SJunchao Zhang 160386a27549SJunchao Zhang PetscFunctionBegin; 160486a27549SJunchao Zhang if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 160586a27549SJunchao Zhang /* Update L^T and do sptrsv symbolic */ 1606076ba34aSJunchao Zhang factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); 160786a27549SJunchao Zhang Kokkos::deep_copy(factors->iLt_d, 0); /* KK requires 0 */ 1608076ba34aSJunchao Zhang factors->jLt_d = MatColIdxKokkosView("factors->jLt_d", factors->jL_d.extent(0)); 1609076ba34aSJunchao Zhang factors->aLt_d = MatScalarKokkosView("factors->aLt_d", factors->aL_d.extent(0)); 161086a27549SJunchao Zhang 16119371c9d4SSatish Balay transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d, factors->jL_d, factors->aL_d, 161286a27549SJunchao Zhang factors->iLt_d, factors->jLt_d, factors->aLt_d); 161386a27549SJunchao Zhang 161486a27549SJunchao Zhang /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 161586a27549SJunchao Zhang We have to sort the indices, until KK provides finer control options. 161686a27549SJunchao Zhang */ 16179371c9d4SSatish Balay sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d); 161886a27549SJunchao Zhang 161986a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d); 162086a27549SJunchao Zhang 162186a27549SJunchao Zhang /* Update U^T and do sptrsv symbolic */ 1622076ba34aSJunchao Zhang factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); 162386a27549SJunchao Zhang Kokkos::deep_copy(factors->iUt_d, 0); /* KK requires 0 */ 1624076ba34aSJunchao Zhang factors->jUt_d = MatColIdxKokkosView("factors->jUt_d", factors->jU_d.extent(0)); 1625076ba34aSJunchao Zhang factors->aUt_d = MatScalarKokkosView("factors->aUt_d", factors->aU_d.extent(0)); 162686a27549SJunchao Zhang 16279371c9d4SSatish Balay transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d, factors->jU_d, factors->aU_d, 162886a27549SJunchao Zhang factors->iUt_d, factors->jUt_d, factors->aUt_d); 162986a27549SJunchao Zhang 163086a27549SJunchao Zhang /* Sort indices. See comments above */ 16319371c9d4SSatish Balay sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d); 163286a27549SJunchao Zhang 163386a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d); 163486a27549SJunchao Zhang factors->transpose_updated = PETSC_TRUE; 163586a27549SJunchao Zhang } 16363ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 163786a27549SJunchao Zhang } 163886a27549SJunchao Zhang 163986a27549SJunchao Zhang /* Solve Ax = b, with A = LU */ 1640d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A, Vec b, Vec x) 1641d71ae5a4SJacob Faibussowitsch { 164286a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 164386a27549SJunchao Zhang PetscScalarKokkosView xv; 164486a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 164586a27549SJunchao Zhang 164686a27549SJunchao Zhang PetscFunctionBegin; 16479566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 16489566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSymbolicSolveCheck(A)); 16499566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(b, &bv)); 16509566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(x, &xv)); 165186a27549SJunchao Zhang /* Solve L tmpv = b */ 16529566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, bv, factors->workVector)); 165386a27549SJunchao Zhang /* Solve Ux = tmpv */ 16549566063dSJacob Faibussowitsch PetscCallCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, factors->workVector, xv)); 16559566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(b, &bv)); 16569566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 16579566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 16583ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 165986a27549SJunchao Zhang } 166086a27549SJunchao Zhang 1661076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */ 1662d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A, Vec b, Vec x) 1663d71ae5a4SJacob Faibussowitsch { 166486a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 166586a27549SJunchao Zhang PetscScalarKokkosView xv; 166686a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr; 166786a27549SJunchao Zhang 166886a27549SJunchao Zhang PetscFunctionBegin; 16699566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 16709566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); 16719566063dSJacob Faibussowitsch PetscCall(VecGetKokkosView(b, &bv)); 16729566063dSJacob Faibussowitsch PetscCall(VecGetKokkosViewWrite(x, &xv)); 167386a27549SJunchao Zhang /* Solve U^T tmpv = b */ 167486a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, bv, factors->workVector); 167586a27549SJunchao Zhang 167686a27549SJunchao Zhang /* Solve L^T x = tmpv */ 167786a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, factors->workVector, xv); 16789566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosView(b, &bv)); 16799566063dSJacob Faibussowitsch PetscCall(VecRestoreKokkosViewWrite(x, &xv)); 16809566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 16813ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 168286a27549SJunchao Zhang } 168386a27549SJunchao Zhang 1684d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info) 1685d71ae5a4SJacob Faibussowitsch { 168686a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr; 168786a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 168886a27549SJunchao Zhang PetscInt fill_lev = info->levels; 168986a27549SJunchao Zhang 169086a27549SJunchao Zhang PetscFunctionBegin; 16919566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeBegin()); 16929566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 1693076ba34aSJunchao Zhang 1694076ba34aSJunchao Zhang auto a_d = aijkok->a_dual.view_device(); 1695076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1696076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1697076ba34aSJunchao Zhang 1698076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d); 169986a27549SJunchao Zhang 170086a27549SJunchao Zhang B->assembled = PETSC_TRUE; 170186a27549SJunchao Zhang B->preallocated = PETSC_TRUE; 170286a27549SJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos; 170386a27549SJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 170486a27549SJunchao Zhang B->ops->matsolve = NULL; 170586a27549SJunchao Zhang B->ops->matsolvetranspose = NULL; 170686a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 170786a27549SJunchao Zhang 170886a27549SJunchao Zhang /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 170986a27549SJunchao Zhang factors->transpose_updated = PETSC_FALSE; 171086a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 1711eeadb341SJunchao Zhang /* TODO: log flops, but how to know that? */ 17129566063dSJacob Faibussowitsch PetscCall(PetscLogGpuTimeEnd()); 17133ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 171486a27549SJunchao Zhang } 171586a27549SJunchao Zhang 1716d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1717d71ae5a4SJacob Faibussowitsch { 171886a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 171986a27549SJunchao Zhang Mat_SeqAIJ *b; 172086a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr; 172186a27549SJunchao Zhang PetscInt fill_lev = info->levels; 172286a27549SJunchao Zhang PetscInt nnzA = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU; 172386a27549SJunchao Zhang PetscInt n = A->rmap->n; 172486a27549SJunchao Zhang 172586a27549SJunchao Zhang PetscFunctionBegin; 17269566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); 172786a27549SJunchao Zhang /* Rebuild factors */ 17289371c9d4SSatish Balay if (factors) { 17299371c9d4SSatish Balay factors->Destroy(); 17309371c9d4SSatish Balay } /* Destroy the old if it exists */ 17319371c9d4SSatish Balay else { 17329371c9d4SSatish Balay B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n); 17339371c9d4SSatish Balay } 173486a27549SJunchao Zhang 173586a27549SJunchao Zhang /* Create a spiluk handle and then do symbolic factorization */ 173686a27549SJunchao Zhang nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); 173786a27549SJunchao Zhang factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU); 173886a27549SJunchao Zhang 173986a27549SJunchao Zhang auto spiluk_handle = factors->kh.get_spiluk_handle(); 174086a27549SJunchao Zhang 174186a27549SJunchao Zhang Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */ 174286a27549SJunchao Zhang Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL()); 174386a27549SJunchao Zhang Kokkos::realloc(factors->iU_d, n + 1); 174486a27549SJunchao Zhang Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU()); 174586a27549SJunchao Zhang 174686a27549SJunchao Zhang aijkok = (Mat_SeqAIJKokkos *)A->spptr; 1747076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1748076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1749076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d); 175086a27549SJunchao Zhang /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 175186a27549SJunchao Zhang 175286a27549SJunchao Zhang Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 175386a27549SJunchao Zhang Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU()); 175486a27549SJunchao Zhang Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */ 175586a27549SJunchao Zhang Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU()); 175686a27549SJunchao Zhang 175786a27549SJunchao Zhang /* TODO: add options to select sptrsv algorithms */ 175886a27549SJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 175986a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 176086a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 176186a27549SJunchao Zhang #else 176286a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 176386a27549SJunchao Zhang #endif 176486a27549SJunchao Zhang 176586a27549SJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */); 176686a27549SJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */); 176786a27549SJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */); 176886a27549SJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */); 176986a27549SJunchao Zhang 177086a27549SJunchao Zhang /* Fill fields of the factor matrix B */ 17719566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL)); 177286a27549SJunchao Zhang b = (Mat_SeqAIJ *)B->data; 177386a27549SJunchao Zhang b->nz = b->maxnz = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU(); 177486a27549SJunchao Zhang B->info.fill_ratio_given = info->fill; 177586a27549SJunchao Zhang B->info.fill_ratio_needed = ((PetscReal)b->nz) / ((PetscReal)nnzA); 177686a27549SJunchao Zhang 177786a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 177886a27549SJunchao Zhang B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 17793ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1780930e68a5SMark Adams } 1781930e68a5SMark Adams 1782d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info) 1783d71ae5a4SJacob Faibussowitsch { 17848f7e8f9dSMark Adams Mat_SeqAIJ *b = (Mat_SeqAIJ *)B->data; 17858f7e8f9dSMark Adams const PetscInt nrows = A->rmap->n; 1786930e68a5SMark Adams 17878f7e8f9dSMark Adams PetscFunctionBegin; 17889566063dSJacob Faibussowitsch PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); 17898f7e8f9dSMark Adams B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 17908f7e8f9dSMark Adams // move B data into Kokkos 17919566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(B)); // create aijkok 17929566063dSJacob Faibussowitsch PetscCall(MatSeqAIJKokkosSyncDevice(A)); // create aijkok 17938f7e8f9dSMark Adams { 17948f7e8f9dSMark Adams Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr); 1795300d22a6SJunchao Zhang if (!baijkok->diag_d.extent(0)) { 17968f7e8f9dSMark Adams const Kokkos::View<PetscInt *, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged>> h_diag(b->diag, nrows + 1); 1797300d22a6SJunchao Zhang baijkok->diag_d = Kokkos::View<PetscInt *>(Kokkos::create_mirror(DefaultMemorySpace(), h_diag)); 1798300d22a6SJunchao Zhang Kokkos::deep_copy(baijkok->diag_d, h_diag); 17998f7e8f9dSMark Adams } 18008f7e8f9dSMark Adams } 18013ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 18028f7e8f9dSMark Adams } 18038f7e8f9dSMark Adams 1804d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A, MatSolverType *type) 1805d71ae5a4SJacob Faibussowitsch { 1806930e68a5SMark Adams PetscFunctionBegin; 1807930e68a5SMark Adams *type = MATSOLVERKOKKOS; 18083ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1809930e68a5SMark Adams } 1810930e68a5SMark Adams 1811d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A, MatSolverType *type) 1812d71ae5a4SJacob Faibussowitsch { 18138f7e8f9dSMark Adams PetscFunctionBegin; 18148f7e8f9dSMark Adams *type = MATSOLVERKOKKOSDEVICE; 18153ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 18168f7e8f9dSMark Adams } 18178f7e8f9dSMark Adams 1818930e68a5SMark Adams /*MC 181986a27549SJunchao Zhang MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 182011a5261eSBarry Smith on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`. 1821930e68a5SMark Adams 1822930e68a5SMark Adams Level: beginner 1823930e68a5SMark Adams 1824*2ef1f0ffSBarry Smith .seealso: [](chapter_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`, `MatKokkosSetFormat()`, `MatKokkosStorageFormat`, `MatKokkosFormatOperation` 1825930e68a5SMark Adams M*/ 182686a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1827930e68a5SMark Adams { 1828930e68a5SMark Adams PetscInt n = A->rmap->n; 1829930e68a5SMark Adams 1830930e68a5SMark Adams PetscFunctionBegin; 18319566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 18329566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*B, n, n, n, n)); 1833930e68a5SMark Adams (*B)->factortype = ftype; 18349566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 18359566063dSJacob Faibussowitsch PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 1836930e68a5SMark Adams 18378f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 18389566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 183986a27549SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 184086a27549SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 184186a27549SJunchao Zhang } else if (ftype == MAT_FACTOR_ILU) { 18429566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 184386a27549SJunchao Zhang (*B)->canuseordering = PETSC_FALSE; 184486a27549SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 184598921bdaSJacob Faibussowitsch } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1846930e68a5SMark Adams 18479566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 18489566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos)); 18493ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1850930e68a5SMark Adams } 18518f7e8f9dSMark Adams 1852d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A, MatFactorType ftype, Mat *B) 1853d71ae5a4SJacob Faibussowitsch { 18548f7e8f9dSMark Adams PetscInt n = A->rmap->n; 18558f7e8f9dSMark Adams 18568f7e8f9dSMark Adams PetscFunctionBegin; 18579566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 18589566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*B, n, n, n, n)); 18598f7e8f9dSMark Adams (*B)->factortype = ftype; 1860f73b0415SBarry Smith (*B)->canuseordering = PETSC_TRUE; 18619566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 18629566063dSJacob Faibussowitsch PetscCall(MatSetType(*B, MATSEQAIJKOKKOS)); 18638f7e8f9dSMark Adams 18648f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 18659566063dSJacob Faibussowitsch PetscCall(MatSetBlockSizesFromMats(*B, A, A)); 18668f7e8f9dSMark Adams (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 18678f7e8f9dSMark Adams } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported for KOKKOS Matrix Types"); 18688f7e8f9dSMark Adams 18699566063dSJacob Faibussowitsch PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL)); 18709566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)(*B), "MatFactorGetSolverType_C", MatFactorGetSolverType_seqaij_kokkos_device)); 18713ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 18728f7e8f9dSMark Adams } 187386a27549SJunchao Zhang 1874d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 1875d71ae5a4SJacob Faibussowitsch { 187686a27549SJunchao Zhang PetscFunctionBegin; 18779566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos)); 18789566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos)); 18799566063dSJacob Faibussowitsch PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_seqaijkokkos_kokkos_device)); 18803ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 188186a27549SJunchao Zhang } 188286a27549SJunchao Zhang 1883076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */ 1884d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat) 1885d71ae5a4SJacob Faibussowitsch { 1886076ba34aSJunchao Zhang const auto &iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.row_map); 1887076ba34aSJunchao Zhang const auto &jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.graph.entries); 1888076ba34aSJunchao Zhang const auto &av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), csrmat.values); 1889076ba34aSJunchao Zhang const PetscInt *i = iv.data(); 1890076ba34aSJunchao Zhang const PetscInt *j = jv.data(); 1891076ba34aSJunchao Zhang const PetscScalar *a = av.data(); 1892076ba34aSJunchao Zhang PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz(); 1893076ba34aSJunchao Zhang 1894076ba34aSJunchao Zhang PetscFunctionBegin; 18959566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz)); 1896076ba34aSJunchao Zhang for (PetscInt k = 0; k < m; k++) { 18979566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k)); 189848a46eb9SPierre Jolivet for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p]))); 18999566063dSJacob Faibussowitsch PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n")); 1900076ba34aSJunchao Zhang } 19013ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1902076ba34aSJunchao Zhang } 1903