111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 2076ba34aSJunchao Zhang #include <petscpkg_version.h> 3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 442550becSJunchao Zhang #include <petsc/private/sfimpl.h> 58c3ff71bSJunchao Zhang #include <petscsystypes.h> 68c3ff71bSJunchao Zhang #include <petscerror.h> 78c3ff71bSJunchao Zhang 88c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 9f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 10076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp> 118c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 128c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 1386a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp> 1486a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp> 15076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp> 16076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp> 1786a27549SJunchao Zhang 188c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h> 1942550becSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp> 208c3ff71bSJunchao Zhang 218c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 228c3ff71bSJunchao Zhang 23076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 24076ba34aSJunchao Zhang we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 25076ba34aSJunchao Zhang In the latter case, it is important to set a_dual's sync state correctly. 26076ba34aSJunchao Zhang */ 278c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 288c3ff71bSJunchao Zhang { 298c3ff71bSJunchao Zhang PetscErrorCode ierr; 30076ba34aSJunchao Zhang Mat_SeqAIJ *aijseq; 31076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 328c3ff71bSJunchao Zhang 338c3ff71bSJunchao Zhang PetscFunctionBegin; 34076ba34aSJunchao Zhang if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 358c3ff71bSJunchao Zhang ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 36076ba34aSJunchao Zhang 37076ba34aSJunchao Zhang aijseq = static_cast<Mat_SeqAIJ*>(A->data); 38076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 39076ba34aSJunchao Zhang 40076ba34aSJunchao Zhang /* If aijkok does not exist, we just copy i, j to device. 41076ba34aSJunchao Zhang If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 42076ba34aSJunchao Zhang In both cases, we build a new aijkok structure. 43076ba34aSJunchao Zhang */ 44076ba34aSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 45076ba34aSJunchao Zhang delete aijkok; 46076ba34aSJunchao Zhang aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 47076ba34aSJunchao Zhang A->spptr = aijkok; 48076ba34aSJunchao Zhang } 49076ba34aSJunchao Zhang 50a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 51a587d139SMark A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 52a587d139SMark } 538c3ff71bSJunchao Zhang PetscFunctionReturn(0); 548c3ff71bSJunchao Zhang } 558c3ff71bSJunchao Zhang 5686a27549SJunchao Zhang /* Sync CSR data to device if not yet */ 57076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 588c3ff71bSJunchao Zhang { 598c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 608c3ff71bSJunchao Zhang 618c3ff71bSJunchao Zhang PetscFunctionBegin; 6286a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 63076ba34aSJunchao Zhang if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); 64076ba34aSJunchao Zhang if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 65076ba34aSJunchao Zhang if (aijkok->a_dual.need_sync_device()) { 66076ba34aSJunchao Zhang aijkok->a_dual.sync_device(); 67076ba34aSJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */ 6886a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 698c3ff71bSJunchao Zhang } 708c3ff71bSJunchao Zhang PetscFunctionReturn(0); 718c3ff71bSJunchao Zhang } 728c3ff71bSJunchao Zhang 73076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */ 74fc76dfabSMark Adams PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 7586a27549SJunchao Zhang { 7686a27549SJunchao Zhang PetscErrorCode ierr; 7786a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 7886a27549SJunchao Zhang 7986a27549SJunchao Zhang PetscFunctionBegin; 8086a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 8186a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 8286a27549SJunchao Zhang aijkok->a_dual.modify_device(); 8386a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 8486a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 852328674fSJunchao Zhang ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); 8686a27549SJunchao Zhang ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 8786a27549SJunchao Zhang PetscFunctionReturn(0); 8886a27549SJunchao Zhang } 8986a27549SJunchao Zhang 90f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 91f0cf5187SStefano Zampini { 92f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 93f0cf5187SStefano Zampini 94f0cf5187SStefano Zampini PetscFunctionBegin; 95f0cf5187SStefano Zampini PetscCheckTypeName(A,MATSEQAIJKOKKOS); 9686a27549SJunchao Zhang /* We do not expect one needs factors on host */ 9786a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 98f0cf5187SStefano Zampini if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 99076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 100f0cf5187SStefano Zampini PetscFunctionReturn(0); 101f0cf5187SStefano Zampini } 102f0cf5187SStefano Zampini 103f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 104f0cf5187SStefano Zampini { 105076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 106f0cf5187SStefano Zampini 107f0cf5187SStefano Zampini PetscFunctionBegin; 108076ba34aSJunchao Zhang if (aijkok) { 109076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 110076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 111076ba34aSJunchao Zhang } else { /* Happens when calling MatSetValues on a newly created matrix */ 112076ba34aSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 113076ba34aSJunchao Zhang } 114076ba34aSJunchao Zhang PetscFunctionReturn(0); 115076ba34aSJunchao Zhang } 116076ba34aSJunchao Zhang 117076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 118076ba34aSJunchao Zhang { 119076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 120076ba34aSJunchao Zhang 121076ba34aSJunchao Zhang PetscFunctionBegin; 122076ba34aSJunchao Zhang if (aijkok) aijkok->a_dual.modify_host(); 123076ba34aSJunchao Zhang PetscFunctionReturn(0); 124076ba34aSJunchao Zhang } 125076ba34aSJunchao Zhang 126076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 127076ba34aSJunchao Zhang { 128076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 129076ba34aSJunchao Zhang 130076ba34aSJunchao Zhang PetscFunctionBegin; 1312328674fSJunchao Zhang if (aijkok) { 132076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 133076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1342328674fSJunchao Zhang } else { 1352328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 1362328674fSJunchao Zhang } 137076ba34aSJunchao Zhang PetscFunctionReturn(0); 138076ba34aSJunchao Zhang } 139076ba34aSJunchao Zhang 140076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 141076ba34aSJunchao Zhang { 142076ba34aSJunchao Zhang PetscFunctionBegin; 143076ba34aSJunchao Zhang *array = NULL; 144076ba34aSJunchao Zhang PetscFunctionReturn(0); 145076ba34aSJunchao Zhang } 146076ba34aSJunchao Zhang 147076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 148076ba34aSJunchao Zhang { 149076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 150076ba34aSJunchao Zhang 151076ba34aSJunchao Zhang PetscFunctionBegin; 1522328674fSJunchao Zhang if (aijkok) { 153076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 1542328674fSJunchao Zhang } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */ 1552328674fSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 1562328674fSJunchao Zhang } 157076ba34aSJunchao Zhang PetscFunctionReturn(0); 158076ba34aSJunchao Zhang } 159076ba34aSJunchao Zhang 160076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 161076ba34aSJunchao Zhang { 162076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 163076ba34aSJunchao Zhang 164076ba34aSJunchao Zhang PetscFunctionBegin; 1652328674fSJunchao Zhang if (aijkok) { 166076ba34aSJunchao Zhang aijkok->a_dual.clear_sync_state(); 167076ba34aSJunchao Zhang aijkok->a_dual.modify_host(); 1682328674fSJunchao Zhang } 169f0cf5187SStefano Zampini PetscFunctionReturn(0); 170f0cf5187SStefano Zampini } 171f0cf5187SStefano Zampini 172a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 173042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 174a587d139SMark { 175042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 176042217e8SBarry Smith Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 177a587d139SMark 178a587d139SMark PetscFunctionBegin; 179076ba34aSJunchao Zhang if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 180152b3e56SJunchao Zhang aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 181a587d139SMark Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 182a587d139SMark PetscFunctionReturn(0); 183a587d139SMark } 184a587d139SMark 185a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 186042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 187a587d139SMark { 188042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 189a587d139SMark 190a587d139SMark PetscFunctionBegin; 191a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 192a587d139SMark *d_mat = aijkok->device_mat_d.data(); 193a587d139SMark } else { 194a587d139SMark PetscErrorCode ierr; 195a587d139SMark ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 196a587d139SMark *d_mat = NULL; 197a587d139SMark } 198a587d139SMark PetscFunctionReturn(0); 199a587d139SMark } 200076ba34aSJunchao Zhang 201076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */ 202076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 203152b3e56SJunchao Zhang { 204152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 205152b3e56SJunchao Zhang 206152b3e56SJunchao Zhang PetscFunctionBegin; 207076ba34aSJunchao Zhang if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 208076ba34aSJunchao Zhang if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 209076ba34aSJunchao Zhang /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 210076ba34aSJunchao Zhang CHKERRCXX(aijkok->a_dual.sync_device()); 211076ba34aSJunchao Zhang CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 212076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 21386a27549SJunchao Zhang aijkok->transpose_updated = PETSC_TRUE; 214076ba34aSJunchao Zhang } 215076ba34aSJunchao Zhang *csrmatT = &aijkok->csrmatT; 216152b3e56SJunchao Zhang PetscFunctionReturn(0); 217152b3e56SJunchao Zhang } 218152b3e56SJunchao Zhang 219076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */ 220076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 221152b3e56SJunchao Zhang { 222eeadb341SJunchao Zhang PetscErrorCode ierr; 223152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 224152b3e56SJunchao Zhang 225152b3e56SJunchao Zhang PetscFunctionBegin; 226eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 227076ba34aSJunchao Zhang if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 228076ba34aSJunchao Zhang if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 229076ba34aSJunchao Zhang CHKERRCXX(aijkok->a_dual.sync_device()); 230076ba34aSJunchao Zhang CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 231076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 232076ba34aSJunchao Zhang #if defined(PETSC_USE_COMPLEX) 233076ba34aSJunchao Zhang const auto& a = aijkok->csrmatH.values; 234076ba34aSJunchao Zhang Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 235076ba34aSJunchao Zhang #endif 23686a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_TRUE; 237076ba34aSJunchao Zhang } 238076ba34aSJunchao Zhang *csrmatH = &aijkok->csrmatH; 239eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 240152b3e56SJunchao Zhang PetscFunctionReturn(0); 241152b3e56SJunchao Zhang } 242a587d139SMark 2438c3ff71bSJunchao Zhang /* y = A x */ 2448c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2458c3ff71bSJunchao Zhang { 2468c3ff71bSJunchao Zhang PetscErrorCode ierr; 2478c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 248152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 249152b3e56SJunchao Zhang PetscScalarKokkosView yv; 2508c3ff71bSJunchao Zhang 2518c3ff71bSJunchao Zhang PetscFunctionBegin; 2528c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 253152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 254152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 2558c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 256eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 257152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 258152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 259152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 260076ba34aSJunchao Zhang /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 261152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 2628c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2638c3ff71bSJunchao Zhang } 2648c3ff71bSJunchao Zhang 2658c3ff71bSJunchao Zhang /* y = A^T x */ 2668c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2678c3ff71bSJunchao Zhang { 2688c3ff71bSJunchao Zhang PetscErrorCode ierr; 2698c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 270152b3e56SJunchao Zhang const char *mode; 271152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 272152b3e56SJunchao Zhang PetscScalarKokkosView yv; 273076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 2748c3ff71bSJunchao Zhang 2758c3ff71bSJunchao Zhang PetscFunctionBegin; 276eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 2778c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 278152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 279152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 280152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 281076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 282152b3e56SJunchao Zhang mode = "N"; 283152b3e56SJunchao Zhang } else { 284076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 285076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 286152b3e56SJunchao Zhang mode = "T"; 287152b3e56SJunchao Zhang } 288076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 289152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 290152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 291076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 292eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 2938c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2948c3ff71bSJunchao Zhang } 2958c3ff71bSJunchao Zhang 2968c3ff71bSJunchao Zhang /* y = A^H x */ 2978c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2988c3ff71bSJunchao Zhang { 2998c3ff71bSJunchao Zhang PetscErrorCode ierr; 3008c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 301152b3e56SJunchao Zhang const char *mode; 302152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 303152b3e56SJunchao Zhang PetscScalarKokkosView yv; 304076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3058c3ff71bSJunchao Zhang 3068c3ff71bSJunchao Zhang PetscFunctionBegin; 307eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3088c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 309152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 310152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 311152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 312076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 313152b3e56SJunchao Zhang mode = "N"; 314152b3e56SJunchao Zhang } else { 315076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 316076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 317152b3e56SJunchao Zhang mode = "C"; 318152b3e56SJunchao Zhang } 319076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 320152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 321152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 322076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 323eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3248c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3258c3ff71bSJunchao Zhang } 3268c3ff71bSJunchao Zhang 3278c3ff71bSJunchao Zhang /* z = A x + y */ 3288c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 3298c3ff71bSJunchao Zhang { 3308c3ff71bSJunchao Zhang PetscErrorCode ierr; 3318c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 332152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 333152b3e56SJunchao Zhang PetscScalarKokkosView zv; 3348c3ff71bSJunchao Zhang 3358c3ff71bSJunchao Zhang PetscFunctionBegin; 336eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3378c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 338152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 339152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 340152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 3418c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 3428c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 343152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 344152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 345152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 346152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 347152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 348eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3498c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3508c3ff71bSJunchao Zhang } 3518c3ff71bSJunchao Zhang 3528c3ff71bSJunchao Zhang /* z = A^T x + y */ 3538c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3548c3ff71bSJunchao Zhang { 3558c3ff71bSJunchao Zhang PetscErrorCode ierr; 3568c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 357152b3e56SJunchao Zhang const char *mode; 358152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 359152b3e56SJunchao Zhang PetscScalarKokkosView zv; 360076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3618c3ff71bSJunchao Zhang 3628c3ff71bSJunchao Zhang PetscFunctionBegin; 363eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3648c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 365152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 366152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 367152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 3688c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 369152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 370076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 371152b3e56SJunchao Zhang mode = "N"; 372152b3e56SJunchao Zhang } else { 373076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 374076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 375152b3e56SJunchao Zhang mode = "T"; 376152b3e56SJunchao Zhang } 377076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 378152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 379152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 380152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 381076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 382eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 3838c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3848c3ff71bSJunchao Zhang } 3858c3ff71bSJunchao Zhang 3868c3ff71bSJunchao Zhang /* z = A^H x + y */ 3878c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3888c3ff71bSJunchao Zhang { 3898c3ff71bSJunchao Zhang PetscErrorCode ierr; 3908c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 391152b3e56SJunchao Zhang const char *mode; 392152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 393152b3e56SJunchao Zhang PetscScalarKokkosView zv; 394076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3958c3ff71bSJunchao Zhang 3968c3ff71bSJunchao Zhang PetscFunctionBegin; 397eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 3988c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 399152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 400152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 401152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 4028c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 403152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 404076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 405152b3e56SJunchao Zhang mode = "N"; 406152b3e56SJunchao Zhang } else { 407076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 408076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 409152b3e56SJunchao Zhang mode = "C"; 410152b3e56SJunchao Zhang } 411076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 412152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 413152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 414152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 415076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 416eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 417152b3e56SJunchao Zhang PetscFunctionReturn(0); 418152b3e56SJunchao Zhang } 419152b3e56SJunchao Zhang 420152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 421152b3e56SJunchao Zhang { 422152b3e56SJunchao Zhang PetscErrorCode ierr; 423152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 424152b3e56SJunchao Zhang 425152b3e56SJunchao Zhang PetscFunctionBegin; 426152b3e56SJunchao Zhang switch (op) { 427152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 428152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 429152b3e56SJunchao Zhang if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 430152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 431152b3e56SJunchao Zhang break; 432152b3e56SJunchao Zhang default: 433152b3e56SJunchao Zhang ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 434152b3e56SJunchao Zhang break; 435152b3e56SJunchao Zhang } 4368c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4378c3ff71bSJunchao Zhang } 4388c3ff71bSJunchao Zhang 439076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */ 4403d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 4418c3ff71bSJunchao Zhang { 4428c3ff71bSJunchao Zhang PetscErrorCode ierr; 443076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 4448c3ff71bSJunchao Zhang 4458c3ff71bSJunchao Zhang PetscFunctionBegin; 446a3f881fbSStefano Zampini ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 447076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 448076ba34aSJunchao Zhang ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 4498c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 450076ba34aSJunchao Zhang ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 451076ba34aSJunchao Zhang } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 452076ba34aSJunchao Zhang if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 453076ba34aSJunchao Zhang ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 454076ba34aSJunchao Zhang ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 455076ba34aSJunchao Zhang ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 456076ba34aSJunchao Zhang ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 457076ba34aSJunchao Zhang aseq = static_cast<Mat_SeqAIJ*>(A->data); 458076ba34aSJunchao Zhang if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */ 459076ba34aSJunchao Zhang if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 460076ba34aSJunchao Zhang A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 4618c3ff71bSJunchao Zhang } 462076ba34aSJunchao Zhang } 4638c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4648c3ff71bSJunchao Zhang } 4658c3ff71bSJunchao Zhang 466076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 467076ba34aSJunchao Zhang an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 468076ba34aSJunchao Zhang */ 469076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 4708c3ff71bSJunchao Zhang { 4718c3ff71bSJunchao Zhang PetscErrorCode ierr; 472076ba34aSJunchao Zhang Mat_SeqAIJ *bseq; 473076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 474076ba34aSJunchao Zhang Mat mat; 4758c3ff71bSJunchao Zhang 4768c3ff71bSJunchao Zhang PetscFunctionBegin; 477076ba34aSJunchao Zhang /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 478076ba34aSJunchao Zhang ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 479076ba34aSJunchao Zhang mat = *B; 480076ba34aSJunchao Zhang if (A->assembled) { 481076ba34aSJunchao Zhang bseq = static_cast<Mat_SeqAIJ*>(mat->data); 482076ba34aSJunchao Zhang bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 483076ba34aSJunchao Zhang bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 484076ba34aSJunchao Zhang /* Now copy values to B if needed */ 485076ba34aSJunchao Zhang if (dupOption == MAT_COPY_VALUES) { 486076ba34aSJunchao Zhang if (akok->a_dual.need_sync_device()) { 487076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 488076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 489076ba34aSJunchao Zhang } else { /* If device has the latest data, we only copy data on device */ 490076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 491076ba34aSJunchao Zhang bkok->a_dual.modify_device(); 492076ba34aSJunchao Zhang } 493076ba34aSJunchao Zhang } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 494076ba34aSJunchao Zhang /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 495076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 496076ba34aSJunchao Zhang } 497076ba34aSJunchao Zhang mat->spptr = bkok; 498076ba34aSJunchao Zhang } 499076ba34aSJunchao Zhang 500076ba34aSJunchao Zhang ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 501076ba34aSJunchao Zhang ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 502076ba34aSJunchao Zhang ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 503076ba34aSJunchao Zhang ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 5048c3ff71bSJunchao Zhang PetscFunctionReturn(0); 5058c3ff71bSJunchao Zhang } 5068c3ff71bSJunchao Zhang 5070ecb592aSJunchao Zhang static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 5080ecb592aSJunchao Zhang { 5090ecb592aSJunchao Zhang PetscErrorCode ierr; 5100ecb592aSJunchao Zhang Mat At; 5110ecb592aSJunchao Zhang KokkosCsrMatrix *internT,*csrmatT; 5120ecb592aSJunchao Zhang Mat_SeqAIJKokkos *atkok,*bkok; 5130ecb592aSJunchao Zhang 5140ecb592aSJunchao Zhang PetscFunctionBegin; 5150ecb592aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */ 5160ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 5170ecb592aSJunchao Zhang CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */ 5180ecb592aSJunchao Zhang CHKERRCXX(atkok = new Mat_SeqAIJKokkos(*csrmatT)); 5190ecb592aSJunchao Zhang ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr); 5200ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) *B = At; 5217a3b1c56SStefano Zampini else {ierr = MatHeaderReplace(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */ 5220ecb592aSJunchao Zhang } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 5230ecb592aSJunchao Zhang if ((*B)->assembled) { 5240ecb592aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 5250ecb592aSJunchao Zhang CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 5260ecb592aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr); 5270ecb592aSJunchao Zhang } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 5280ecb592aSJunchao Zhang Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 5290ecb592aSJunchao Zhang MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 5300ecb592aSJunchao Zhang MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 5310ecb592aSJunchao Zhang CHKERRCXX(Kokkos::deep_copy(a_h,internT->values)); 5320ecb592aSJunchao Zhang CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 5330ecb592aSJunchao Zhang } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 5340ecb592aSJunchao Zhang } 5350ecb592aSJunchao Zhang PetscFunctionReturn(0); 5360ecb592aSJunchao Zhang } 5370ecb592aSJunchao Zhang 5388c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 5398c3ff71bSJunchao Zhang { 5408c3ff71bSJunchao Zhang PetscErrorCode ierr; 54186a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 5428c3ff71bSJunchao Zhang 5438c3ff71bSJunchao Zhang PetscFunctionBegin; 54486a27549SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 54586a27549SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 5468f7e8f9dSMark Adams if (aijkok) { 5478f7e8f9dSMark Adams if (aijkok->device_mat_d.data()) { 548a587d139SMark delete aijkok->colmap_d; 549a587d139SMark delete aijkok->i_uncompressed_d; 550a587d139SMark } 5518f7e8f9dSMark Adams if (aijkok->diag_d) delete aijkok->diag_d; 5528f7e8f9dSMark Adams } 5538c3ff71bSJunchao Zhang delete aijkok; 55486a27549SJunchao Zhang } else { 55586a27549SJunchao Zhang delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 55686a27549SJunchao Zhang } 557152b3e56SJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 55842550becSJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr); 55942550becSJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", NULL);CHKERRQ(ierr); 560152b3e56SJunchao Zhang A->spptr = NULL; 5618c3ff71bSJunchao Zhang ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 5628c3ff71bSJunchao Zhang PetscFunctionReturn(0); 5638c3ff71bSJunchao Zhang } 5648c3ff71bSJunchao Zhang 56586a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 56686a27549SJunchao Zhang { 56786a27549SJunchao Zhang PetscErrorCode ierr; 56886a27549SJunchao Zhang 56986a27549SJunchao Zhang PetscFunctionBegin; 57086a27549SJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 57186a27549SJunchao Zhang ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 57286a27549SJunchao Zhang ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 57386a27549SJunchao Zhang PetscFunctionReturn(0); 57486a27549SJunchao Zhang } 57586a27549SJunchao Zhang 576076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 577076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 578a3f881fbSStefano Zampini { 579076ba34aSJunchao Zhang PetscErrorCode ierr; 580076ba34aSJunchao Zhang Mat_SeqAIJ *a,*b; 581076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok,*bkok,*ckok; 582076ba34aSJunchao Zhang MatScalarKokkosView aa,ba,ca; 583076ba34aSJunchao Zhang MatRowMapKokkosView ai,bi,ci; 584076ba34aSJunchao Zhang MatColIdxKokkosView aj,bj,cj; 585076ba34aSJunchao Zhang PetscInt m,n,nnz,aN; 586a3f881fbSStefano Zampini 587a3f881fbSStefano Zampini PetscFunctionBegin; 588076ba34aSJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 589076ba34aSJunchao Zhang PetscValidHeaderSpecific(B,MAT_CLASSID,2); 590076ba34aSJunchao Zhang PetscValidPointer(C,4); 591076ba34aSJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 592076ba34aSJunchao Zhang PetscCheckTypeName(B,MATSEQAIJKOKKOS); 593c0aa6a63SJacob Faibussowitsch if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT,A->rmap->n,B->rmap->n); 594076ba34aSJunchao Zhang if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 595076ba34aSJunchao Zhang 596076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 597076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 598076ba34aSJunchao Zhang a = static_cast<Mat_SeqAIJ*>(A->data); 599076ba34aSJunchao Zhang b = static_cast<Mat_SeqAIJ*>(B->data); 600076ba34aSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 601076ba34aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 602076ba34aSJunchao Zhang aa = akok->a_dual.view_device(); 603076ba34aSJunchao Zhang ai = akok->i_dual.view_device(); 604076ba34aSJunchao Zhang ba = bkok->a_dual.view_device(); 605076ba34aSJunchao Zhang bi = bkok->i_dual.view_device(); 606076ba34aSJunchao Zhang m = A->rmap->n; /* M, N and nnz of C */ 607076ba34aSJunchao Zhang n = A->cmap->n + B->cmap->n; 608076ba34aSJunchao Zhang nnz = a->nz + b->nz; 609076ba34aSJunchao Zhang aN = A->cmap->n; /* N of A */ 610076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { 611076ba34aSJunchao Zhang aj = akok->j_dual.view_device(); 612076ba34aSJunchao Zhang bj = bkok->j_dual.view_device(); 613076ba34aSJunchao Zhang auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 614076ba34aSJunchao Zhang auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 615076ba34aSJunchao Zhang auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 616076ba34aSJunchao Zhang ca = ca_dual.view_device(); 617076ba34aSJunchao Zhang ci = ci_dual.view_device(); 618076ba34aSJunchao Zhang cj = cj_dual.view_device(); 619076ba34aSJunchao Zhang 620076ba34aSJunchao Zhang /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 621076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 622076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 623076ba34aSJunchao Zhang PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 624076ba34aSJunchao Zhang 625076ba34aSJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 626076ba34aSJunchao Zhang ci(i) = coffset; 627076ba34aSJunchao Zhang if (i == m-1) ci(m) = ai(m) + bi(m); 628076ba34aSJunchao Zhang }); 629076ba34aSJunchao Zhang 630076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 631076ba34aSJunchao Zhang if (k < alen) { 632076ba34aSJunchao Zhang ca(coffset+k) = aa(ai(i)+k); 633076ba34aSJunchao Zhang cj(coffset+k) = aj(ai(i)+k); 634076ba34aSJunchao Zhang } else { 635076ba34aSJunchao Zhang ca(coffset+k) = ba(bi(i)+k-alen); 636076ba34aSJunchao Zhang cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 637076ba34aSJunchao Zhang } 638076ba34aSJunchao Zhang }); 639076ba34aSJunchao Zhang }); 640076ba34aSJunchao Zhang ca_dual.modify_device(); 641076ba34aSJunchao Zhang ci_dual.modify_device(); 642076ba34aSJunchao Zhang cj_dual.modify_device(); 643076ba34aSJunchao Zhang CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 644076ba34aSJunchao Zhang ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 645076ba34aSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { 646076ba34aSJunchao Zhang PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 647076ba34aSJunchao Zhang PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 648076ba34aSJunchao Zhang ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 649076ba34aSJunchao Zhang ca = ckok->a_dual.view_device(); 650076ba34aSJunchao Zhang ci = ckok->i_dual.view_device(); 651076ba34aSJunchao Zhang 652076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 653076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 654076ba34aSJunchao Zhang PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 655076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 656076ba34aSJunchao Zhang if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 657076ba34aSJunchao Zhang else ca(ci(i)+k) = ba(bi(i)+k-alen); 658076ba34aSJunchao Zhang }); 659076ba34aSJunchao Zhang }); 660076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 661076ba34aSJunchao Zhang } 662076ba34aSJunchao Zhang PetscFunctionReturn(0); 663076ba34aSJunchao Zhang } 664076ba34aSJunchao Zhang 665076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 666076ba34aSJunchao Zhang { 667076ba34aSJunchao Zhang PetscFunctionBegin; 668076ba34aSJunchao Zhang delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 669a3f881fbSStefano Zampini PetscFunctionReturn(0); 670a3f881fbSStefano Zampini } 671a3f881fbSStefano Zampini 672a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 673a3f881fbSStefano Zampini { 674076ba34aSJunchao Zhang PetscErrorCode ierr; 675a3f881fbSStefano Zampini Mat_Product *product = C->product; 676a3f881fbSStefano Zampini Mat A,B; 677076ba34aSJunchao Zhang bool transA,transB; /* use bool, since KK needs this type */ 678a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok,*bkok,*ckok; 679a3f881fbSStefano Zampini Mat_SeqAIJ *c; 680076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 681076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA,*csrmatB; 682a3f881fbSStefano Zampini 683a3f881fbSStefano Zampini PetscFunctionBegin; 684a3f881fbSStefano Zampini MatCheckProduct(C,1); 685a3f881fbSStefano Zampini if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 686076ba34aSJunchao Zhang pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 687076ba34aSJunchao Zhang 688076ba34aSJunchao Zhang if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 689076ba34aSJunchao Zhang pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 690076ba34aSJunchao Zhang PetscFunctionReturn(0); 691076ba34aSJunchao Zhang } 692076ba34aSJunchao Zhang 693076ba34aSJunchao Zhang switch (product->type) { 694076ba34aSJunchao Zhang case MATPRODUCT_AB: transA = false; transB = false; break; 695076ba34aSJunchao Zhang case MATPRODUCT_AtB: transA = true; transB = false; break; 696076ba34aSJunchao Zhang case MATPRODUCT_ABt: transA = false; transB = true; break; 697076ba34aSJunchao Zhang default: 698076ba34aSJunchao Zhang SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 699076ba34aSJunchao Zhang } 700076ba34aSJunchao Zhang 701a3f881fbSStefano Zampini A = product->A; 702a3f881fbSStefano Zampini B = product->B; 703a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 704a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 705a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 706a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 707a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 708076ba34aSJunchao Zhang 709076ba34aSJunchao Zhang if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 710076ba34aSJunchao Zhang 711076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 712076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 713076ba34aSJunchao Zhang 714076ba34aSJunchao Zhang /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 715076ba34aSJunchao Zhang if (transA) { 716076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 717076ba34aSJunchao Zhang transA = false; 718a3f881fbSStefano Zampini } 719a3f881fbSStefano Zampini 720076ba34aSJunchao Zhang if (transB) { 721076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 722076ba34aSJunchao Zhang transB = false; 723076ba34aSJunchao Zhang } 724eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 725076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 726076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 727eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 728076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 729a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 730a3f881fbSStefano Zampini c = (Mat_SeqAIJ*)C->data; 731c0aa6a63SJacob Faibussowitsch ierr = PetscInfo3(C,"Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded,%" PetscInt_FMT " used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 732a3f881fbSStefano Zampini ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 733c0aa6a63SJacob Faibussowitsch ierr = PetscInfo1(C,"Maximum nonzeros in any row is %" PetscInt_FMT "\n",c->rmax);CHKERRQ(ierr); 734a3f881fbSStefano Zampini c->reallocs = 0; 735076ba34aSJunchao Zhang C->info.mallocs = 0; 736a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 737a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 738a3f881fbSStefano Zampini C->num_ass++; 739a3f881fbSStefano Zampini PetscFunctionReturn(0); 740a3f881fbSStefano Zampini } 741a3f881fbSStefano Zampini 742a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 743a3f881fbSStefano Zampini { 744a3f881fbSStefano Zampini PetscErrorCode ierr; 745076ba34aSJunchao Zhang Mat_Product *product = C->product; 746076ba34aSJunchao Zhang MatProductType ptype; 747076ba34aSJunchao Zhang Mat A,B; 748076ba34aSJunchao Zhang bool transA,transB; 749076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok,*bkok,*ckok; 750076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 751076ba34aSJunchao Zhang MPI_Comm comm; 752076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 753a3f881fbSStefano Zampini 754a3f881fbSStefano Zampini PetscFunctionBegin; 755a3f881fbSStefano Zampini MatCheckProduct(C,1); 756076ba34aSJunchao Zhang ierr = PetscObjectGetComm((PetscObject)C,&comm); 757076ba34aSJunchao Zhang if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty"); 758a3f881fbSStefano Zampini A = product->A; 759a3f881fbSStefano Zampini B = product->B; 760a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 761a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 762a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 763a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 764076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 765076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 766076ba34aSJunchao Zhang 767a3f881fbSStefano Zampini ptype = product->type; 768a3f881fbSStefano Zampini switch (ptype) { 769076ba34aSJunchao Zhang case MATPRODUCT_AB: transA = false; transB = false; break; 770076ba34aSJunchao Zhang case MATPRODUCT_AtB: transA = true; transB = false; break; 771076ba34aSJunchao Zhang case MATPRODUCT_ABt: transA = false; transB = true; break; 772a3f881fbSStefano Zampini default: 773076ba34aSJunchao Zhang SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 774a3f881fbSStefano Zampini } 775a3f881fbSStefano Zampini 776076ba34aSJunchao Zhang product->data = pdata = new MatProductData_SeqAIJKokkos(); 777076ba34aSJunchao Zhang pdata->kh.set_team_work_size(16); 778076ba34aSJunchao Zhang pdata->kh.set_dynamic_scheduling(true); 779076ba34aSJunchao Zhang pdata->reusesym = product->api_user; 780a3f881fbSStefano Zampini 781076ba34aSJunchao Zhang /* TODO: add command line options to select spgemm algorithms */ 782076ba34aSJunchao Zhang auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 7836ffb9508SJunchao Zhang /* CUDA-10.2's spgemm has bugs. As as of 2022-01-19, KK does not support CUDA-11.x's newer spgemm API. 7846ffb9508SJunchao Zhang We can default to SPGEMMAlgorithm::SPGEMM_CUSPARSE with CUDA-11+ when KK adds the support. 7856ffb9508SJunchao Zhang */ 786076ba34aSJunchao Zhang pdata->kh.create_spgemm_handle(spgemm_alg); 787076ba34aSJunchao Zhang 788eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 789076ba34aSJunchao Zhang /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 790076ba34aSJunchao Zhang if (transA) { 791076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 792076ba34aSJunchao Zhang transA = false; 793076ba34aSJunchao Zhang } 794076ba34aSJunchao Zhang 795076ba34aSJunchao Zhang if (transB) { 796076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 797076ba34aSJunchao Zhang transB = false; 798076ba34aSJunchao Zhang } 799076ba34aSJunchao Zhang 800076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 801076ba34aSJunchao Zhang /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 802076ba34aSJunchao Zhang So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 803076ba34aSJunchao Zhang calling new Mat_SeqAIJKokkos(). 804076ba34aSJunchao Zhang TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 805076ba34aSJunchao Zhang */ 806076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 807076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 808eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 809076ba34aSJunchao Zhang 810076ba34aSJunchao Zhang CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 811076ba34aSJunchao Zhang ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 812076ba34aSJunchao Zhang C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 813a3f881fbSStefano Zampini PetscFunctionReturn(0); 814a3f881fbSStefano Zampini } 815a3f881fbSStefano Zampini 816a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 817076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 818a3f881fbSStefano Zampini { 819a3f881fbSStefano Zampini PetscErrorCode ierr; 820076ba34aSJunchao Zhang Mat_Product *product = mat->product; 821a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 822a3f881fbSStefano Zampini 823a3f881fbSStefano Zampini PetscFunctionBegin; 824a3f881fbSStefano Zampini MatCheckProduct(mat,1); 825a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 826a3f881fbSStefano Zampini if (product->type == MATPRODUCT_ABC) { 827a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 828a3f881fbSStefano Zampini } 829a3f881fbSStefano Zampini if (Biskok && Ciskok) { 830a3f881fbSStefano Zampini switch (product->type) { 831a3f881fbSStefano Zampini case MATPRODUCT_AB: 832a3f881fbSStefano Zampini case MATPRODUCT_AtB: 833a3f881fbSStefano Zampini case MATPRODUCT_ABt: 834a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 835a3f881fbSStefano Zampini break; 836a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 837a3f881fbSStefano Zampini case MATPRODUCT_RARt: 838a3f881fbSStefano Zampini case MATPRODUCT_ABC: 839a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 840a3f881fbSStefano Zampini break; 841a3f881fbSStefano Zampini default: 842a3f881fbSStefano Zampini break; 843a3f881fbSStefano Zampini } 844a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 845a3f881fbSStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 846a3f881fbSStefano Zampini } 847a3f881fbSStefano Zampini PetscFunctionReturn(0); 848a3f881fbSStefano Zampini } 849a587d139SMark 850f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 851f0cf5187SStefano Zampini { 852f0cf5187SStefano Zampini PetscErrorCode ierr; 853f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 854f0cf5187SStefano Zampini 855f0cf5187SStefano Zampini PetscFunctionBegin; 856eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 857f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 858f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 859076ba34aSJunchao Zhang KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 860076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 861076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 862eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 863f0cf5187SStefano Zampini PetscFunctionReturn(0); 864f0cf5187SStefano Zampini } 865f0cf5187SStefano Zampini 866a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 867a587d139SMark { 868a587d139SMark PetscErrorCode ierr; 869076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 870a587d139SMark 871a587d139SMark PetscFunctionBegin; 872076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 8732328674fSJunchao Zhang if (aijkok) { /* Only zero the device if data is already there */ 874076ba34aSJunchao Zhang KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 875076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 8762328674fSJunchao Zhang } else { /* Might be preallocated but not assembled */ 8772328674fSJunchao Zhang ierr = MatZeroEntries_SeqAIJ(A);CHKERRQ(ierr); 8782328674fSJunchao Zhang } 879a587d139SMark PetscFunctionReturn(0); 880a587d139SMark } 881a587d139SMark 882db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 88342550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView* kv) 884db78de30SJunchao Zhang { 885db78de30SJunchao Zhang PetscErrorCode ierr; 886db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 887db78de30SJunchao Zhang 888db78de30SJunchao Zhang PetscFunctionBegin; 889db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 890db78de30SJunchao Zhang PetscValidPointer(kv,2); 891db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 892db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 893db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 894076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 895db78de30SJunchao Zhang PetscFunctionReturn(0); 896db78de30SJunchao Zhang } 897db78de30SJunchao Zhang 89842550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView* kv) 899db78de30SJunchao Zhang { 900db78de30SJunchao Zhang PetscFunctionBegin; 901db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 902db78de30SJunchao Zhang PetscValidPointer(kv,2); 903db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 904db78de30SJunchao Zhang PetscFunctionReturn(0); 905db78de30SJunchao Zhang } 906db78de30SJunchao Zhang 90742550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView* kv) 908db78de30SJunchao Zhang { 909db78de30SJunchao Zhang PetscErrorCode ierr; 910db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 911db78de30SJunchao Zhang 912db78de30SJunchao Zhang PetscFunctionBegin; 913db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 914db78de30SJunchao Zhang PetscValidPointer(kv,2); 915db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 916db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 917db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 918076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 919db78de30SJunchao Zhang PetscFunctionReturn(0); 920db78de30SJunchao Zhang } 921db78de30SJunchao Zhang 92242550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView* kv) 923db78de30SJunchao Zhang { 924db78de30SJunchao Zhang PetscErrorCode ierr; 925db78de30SJunchao Zhang 926db78de30SJunchao Zhang PetscFunctionBegin; 927db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 928db78de30SJunchao Zhang PetscValidPointer(kv,2); 929db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 930076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 931db78de30SJunchao Zhang PetscFunctionReturn(0); 932db78de30SJunchao Zhang } 933db78de30SJunchao Zhang 93442550becSJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 935db78de30SJunchao Zhang { 936db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 937db78de30SJunchao Zhang 938db78de30SJunchao Zhang PetscFunctionBegin; 939db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 940db78de30SJunchao Zhang PetscValidPointer(kv,2); 941db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 942db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 943076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 944db78de30SJunchao Zhang PetscFunctionReturn(0); 945db78de30SJunchao Zhang } 946db78de30SJunchao Zhang 94742550becSJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView* kv) 948db78de30SJunchao Zhang { 949db78de30SJunchao Zhang PetscErrorCode ierr; 950db78de30SJunchao Zhang 951db78de30SJunchao Zhang PetscFunctionBegin; 952db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 953db78de30SJunchao Zhang PetscValidPointer(kv,2); 954db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 955076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 956db78de30SJunchao Zhang PetscFunctionReturn(0); 957db78de30SJunchao Zhang } 958db78de30SJunchao Zhang 959c17cf699SJunchao Zhang /* Computes Y += alpha X */ 960c17cf699SJunchao Zhang static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 961a587d139SMark { 962a587d139SMark PetscErrorCode ierr; 963a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 964c17cf699SJunchao Zhang Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 965c17cf699SJunchao Zhang ConstMatScalarKokkosView Xa; 966c17cf699SJunchao Zhang MatScalarKokkosView Ya; 967a587d139SMark 968a587d139SMark PetscFunctionBegin; 969c17cf699SJunchao Zhang PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 970c17cf699SJunchao Zhang PetscCheckTypeName(X,MATSEQAIJKOKKOS); 971c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 972c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 973eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 974db78de30SJunchao Zhang 975c17cf699SJunchao Zhang if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 976c17cf699SJunchao Zhang /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 977a587d139SMark PetscBool e; 978a587d139SMark ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 979a587d139SMark if (e) { 980a587d139SMark ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 981c17cf699SJunchao Zhang if (e) pattern = SAME_NONZERO_PATTERN; 982a587d139SMark } 983a587d139SMark } 984db78de30SJunchao Zhang 985c17cf699SJunchao Zhang /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 986c17cf699SJunchao Zhang cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 987c17cf699SJunchao Zhang If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 988c17cf699SJunchao Zhang KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 989c17cf699SJunchao Zhang */ 990c17cf699SJunchao Zhang ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 991c17cf699SJunchao Zhang xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 992c17cf699SJunchao Zhang Xa = xkok->a_dual.view_device(); 993c17cf699SJunchao Zhang Ya = ykok->a_dual.view_device(); 994c17cf699SJunchao Zhang 995c17cf699SJunchao Zhang if (pattern == SAME_NONZERO_PATTERN) { 996c17cf699SJunchao Zhang KokkosBlas::axpy(alpha,Xa,Ya); 997c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(Y); 998c17cf699SJunchao Zhang } else if (pattern == SUBSET_NONZERO_PATTERN) { 999c17cf699SJunchao Zhang MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 1000c17cf699SJunchao Zhang MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 1001c17cf699SJunchao Zhang 1002c17cf699SJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 1003c17cf699SJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 1004c17cf699SJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 1005c17cf699SJunchao Zhang PetscInt p,q = Yi(i); 1006c17cf699SJunchao Zhang for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 1007c17cf699SJunchao Zhang while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 1008c17cf699SJunchao Zhang if (Xj(p) == Yj(q)) { /* Found it */ 1009c17cf699SJunchao Zhang Ya(q) += alpha * Xa(p); 1010c17cf699SJunchao Zhang q++; 1011a587d139SMark } else { 1012c17cf699SJunchao Zhang /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 1013c17cf699SJunchao Zhang Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 1014c17cf699SJunchao Zhang */ 1015c17cf699SJunchao Zhang if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 1016a587d139SMark } 1017c17cf699SJunchao Zhang } 1018c17cf699SJunchao Zhang }); 1019c17cf699SJunchao Zhang }); 1020c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(Y); 1021c17cf699SJunchao Zhang } else { /* different nonzero patterns */ 1022c17cf699SJunchao Zhang Mat Z; 1023c17cf699SJunchao Zhang KokkosCsrMatrix zcsr; 1024c17cf699SJunchao Zhang KernelHandle kh; 1025c17cf699SJunchao Zhang kh.create_spadd_handle(false); 1026c17cf699SJunchao Zhang KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1027c17cf699SJunchao Zhang KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1028c17cf699SJunchao Zhang zkok = new Mat_SeqAIJKokkos(zcsr); 1029c17cf699SJunchao Zhang ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); 1030c17cf699SJunchao Zhang ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); 1031c17cf699SJunchao Zhang kh.destroy_spadd_handle(); 1032c17cf699SJunchao Zhang } 1033eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 1034eeadb341SJunchao Zhang ierr = PetscLogGpuFlops(xkok->a_dual.extent(0)*2);CHKERRQ(ierr); /* Because we scaled X and then added it to Y */ 1035a587d139SMark PetscFunctionReturn(0); 1036a587d139SMark } 1037a587d139SMark 103842550becSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[]) 103942550becSJunchao Zhang { 104042550becSJunchao Zhang PetscErrorCode ierr; 104142550becSJunchao Zhang MPI_Comm comm; 104242550becSJunchao Zhang PetscInt *i,*j,*perm,*jmap; 104342550becSJunchao Zhang PetscInt k,M,N,p,q,row,start,end,nnz,nneg; 104442550becSJunchao Zhang PetscBool has_repeats = PETSC_FALSE; 104542550becSJunchao Zhang PetscInt *Ai,*Aj; 104642550becSJunchao Zhang PetscScalar *Aa; 104742550becSJunchao Zhang Mat_SeqAIJKokkos *akok; 104842550becSJunchao Zhang Mat_SeqAIJ *aseq; 104942550becSJunchao Zhang MatRowMapKokkosViewHost perm_h("perm",n),jmap_h; 105042550becSJunchao Zhang Mat newmat; 105142550becSJunchao Zhang 105242550becSJunchao Zhang PetscFunctionBegin; 105342550becSJunchao Zhang ierr = PetscObjectGetComm((PetscObject)mat,&comm);CHKERRQ(ierr); 105442550becSJunchao Zhang ierr = MatGetSize(mat,&M,&N);CHKERRQ(ierr); 105542550becSJunchao Zhang ierr = PetscMalloc2(n,&i,n,&j);CHKERRQ(ierr); 105642550becSJunchao Zhang ierr = PetscArraycpy(i,coo_i,n);CHKERRQ(ierr); /* Make a copy since we'll modify it */ 105742550becSJunchao Zhang ierr = PetscArraycpy(j,coo_j,n);CHKERRQ(ierr); 105842550becSJunchao Zhang perm = perm_h.data(); 105942550becSJunchao Zhang for (k=0; k<n; k++) { /* Ignore entries with negative row or col indices */ 106042550becSJunchao Zhang if (j[k] < 0) i[k] = -1; 106142550becSJunchao Zhang perm[k] = k; 106242550becSJunchao Zhang } 106342550becSJunchao Zhang 106442550becSJunchao Zhang /* Sort by row */ 106542550becSJunchao Zhang ierr = PetscSortIntWithArrayPair(n,i,j,perm);CHKERRQ(ierr); 106642550becSJunchao Zhang for (k=0; k<n; k++) {if (i[k] >= 0) break;} /* Advance k to the first row with a non-negative index */ 106742550becSJunchao Zhang nneg = k; 106842550becSJunchao Zhang Kokkos::resize(jmap_h,n-nneg+1); /* Allocate an extra to make a CSR-like data structure. jmap[i] is the number of repeats for i-th nonzero */ 106942550becSJunchao Zhang nnz = 0; /* Total number of unique nonzeros to be counted */ 107042550becSJunchao Zhang jmap = jmap_h.data(); /* In the end, only the first nnz+1 elements in jmap[] are significant. */ 107142550becSJunchao Zhang jmap++; /* Inc jmap by 1 for convinience */ 107242550becSJunchao Zhang 107342550becSJunchao Zhang ierr = PetscCalloc1(M+1,&Ai);CHKERRQ(ierr); /* CSR of A */ 107442550becSJunchao Zhang ierr = PetscMalloc1(n-nneg,&Aj);CHKERRQ(ierr); /* We have at most n-k unique nonzeros */ 107542550becSJunchao Zhang 107642550becSJunchao Zhang /* In each row, sort by column, then unique column indices to get row length */ 107742550becSJunchao Zhang Ai++; /* Inc by 1 for convinience */ 107842550becSJunchao Zhang q = 0; /* q-th unique nonzero, with q starting from 0 */ 107942550becSJunchao Zhang while (k<n) { 108042550becSJunchao Zhang row = i[k]; 108142550becSJunchao Zhang start = k; /* [start,end) indices for this row */ 108242550becSJunchao Zhang while (k<n && i[k] == row) k++; 108342550becSJunchao Zhang end = k; 108442550becSJunchao Zhang ierr = PetscSortIntWithArray(end-start,j+start,perm+start);CHKERRQ(ierr); 108542550becSJunchao Zhang /* Find number of unique col entries in this row */ 108642550becSJunchao Zhang Aj[q] = j[start]; /* Log the first nonzero in this row */ 108742550becSJunchao Zhang jmap[q] = 1; /* Number of repeats of this nozero entry */ 108842550becSJunchao Zhang Ai[row] = 1; 108942550becSJunchao Zhang nnz++; 109042550becSJunchao Zhang 109142550becSJunchao Zhang for (p=start+1; p<end; p++) { /* Scan remaining nonzero in this row */ 109242550becSJunchao Zhang if (j[p] != j[p-1]) { /* Meet a new nonzero */ 109342550becSJunchao Zhang q++; 109442550becSJunchao Zhang jmap[q] = 1; 109542550becSJunchao Zhang Aj[q] = j[p]; 109642550becSJunchao Zhang Ai[row]++; 109742550becSJunchao Zhang nnz++; 109842550becSJunchao Zhang } else { 109942550becSJunchao Zhang jmap[q]++; 110042550becSJunchao Zhang has_repeats = PETSC_TRUE; 110142550becSJunchao Zhang } 110242550becSJunchao Zhang } 110342550becSJunchao Zhang q++; /* Move to next row and thus next unique nonzero */ 110442550becSJunchao Zhang } 110542550becSJunchao Zhang ierr = PetscFree2(i,j);CHKERRQ(ierr); 110642550becSJunchao Zhang 110742550becSJunchao Zhang Ai--; /* Back to the beginning of Ai[] */ 110842550becSJunchao Zhang for (k=0; k<M; k++) Ai[k+1] += Ai[k]; 110942550becSJunchao Zhang jmap--; /* Back to the beginning of jmap[] */ 111042550becSJunchao Zhang jmap[0] = 0; 111142550becSJunchao Zhang if (has_repeats) { /* Only transform jmap[] to CSR when having repeats, otherwise jmap[] is not used */ 111242550becSJunchao Zhang for (k=0; k<nnz; k++) jmap[k+1] += jmap[k]; 111342550becSJunchao Zhang Kokkos::resize(jmap_h,nnz+1); /* Resize wrt the actual number of nonzeros */ 111442550becSJunchao Zhang } 111542550becSJunchao Zhang 111642550becSJunchao Zhang if (nnz < n-nneg) { /* Realloc Aj[] to actual number of nonzeros */ 111742550becSJunchao Zhang PetscInt *Aj_new; 111842550becSJunchao Zhang ierr = PetscMalloc1(nnz,&Aj_new);CHKERRQ(ierr); 111942550becSJunchao Zhang ierr = PetscArraycpy(Aj_new,Aj,nnz);CHKERRQ(ierr); 112042550becSJunchao Zhang ierr = PetscFree(Aj);CHKERRQ(ierr); 112142550becSJunchao Zhang Aj = Aj_new; 112242550becSJunchao Zhang } 112342550becSJunchao Zhang 112442550becSJunchao Zhang ierr = PetscMalloc1(nnz,&Aa);CHKERRQ(ierr); 112542550becSJunchao Zhang ierr = MatCreateSeqAIJWithArrays(comm,M,N,Ai,Aj,Aa,&newmat);CHKERRQ(ierr); 112642550becSJunchao Zhang aseq = static_cast<Mat_SeqAIJ*>(newmat->data); 112742550becSJunchao Zhang aseq->singlemalloc = PETSC_FALSE; /* Let newmat own Ai,Aj,Aa */ 112842550becSJunchao Zhang aseq->free_a = aseq->free_ij = PETSC_TRUE; 112942550becSJunchao Zhang 113042550becSJunchao Zhang ierr = MatConvert(newmat,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr); 113142550becSJunchao Zhang ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr); 113242550becSJunchao Zhang ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */ 113342550becSJunchao Zhang 113442550becSJunchao Zhang if (nneg) { /* Discard heading entries with negative indices in perm_h, as we'll access it from index 0 in MatSetValuesCOO */ 113542550becSJunchao Zhang MatRowMapKokkosViewHost newperm_h("perm",n-nneg); 113642550becSJunchao Zhang Kokkos::deep_copy(newperm_h,Kokkos::subview(perm_h,Kokkos::make_pair(nneg,n))); 113742550becSJunchao Zhang perm_h = newperm_h; 113842550becSJunchao Zhang } 113942550becSJunchao Zhang 114042550becSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos*>(mat->spptr); 114142550becSJunchao Zhang akok->SetUpCOO(n,has_repeats,jmap_h,perm_h); 114242550becSJunchao Zhang PetscFunctionReturn(0); 114342550becSJunchao Zhang } 114442550becSJunchao Zhang 114542550becSJunchao Zhang static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode) 114642550becSJunchao Zhang { 114742550becSJunchao Zhang PetscErrorCode ierr; 114842550becSJunchao Zhang Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ*>(A->data); 114942550becSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 115042550becSJunchao Zhang PetscInt nz = aseq->nz; 115142550becSJunchao Zhang const MatRowMapKokkosView& jmap = akok->jmap_d; 115242550becSJunchao Zhang const MatRowMapKokkosView& perm = akok->perm_d; 115342550becSJunchao Zhang MatScalarKokkosView Aa; 115442550becSJunchao Zhang ConstMatScalarKokkosView kv; 115542550becSJunchao Zhang PetscMemType memtype; 115642550becSJunchao Zhang 115742550becSJunchao Zhang PetscFunctionBegin; 115842550becSJunchao Zhang if (!v) { /* NULL v means an all zero array */ 115942550becSJunchao Zhang ierr = MatZeroEntries(A);CHKERRQ(ierr); 116042550becSJunchao Zhang PetscFunctionReturn(0); 116142550becSJunchao Zhang } 116242550becSJunchao Zhang 116342550becSJunchao Zhang ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr); 116442550becSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */ 116542550becSJunchao Zhang kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(),ConstMatScalarKokkosViewHost(v,akok->coo_n)); 116642550becSJunchao Zhang } else { 116742550becSJunchao Zhang kv = ConstMatScalarKokkosView(v,akok->coo_n); /* Directly use v[]'s memory */ 116842550becSJunchao Zhang } 116942550becSJunchao Zhang 117042550becSJunchao Zhang ierr = MatSeqAIJGetKokkosView(A,&Aa);CHKERRQ(ierr); /* Might read and write matrix values */ 117142550becSJunchao Zhang if (imode == INSERT_VALUES) { 117242550becSJunchao Zhang Kokkos::deep_copy(Aa,0.0); /* Zero matrix values since INSERT_VALUES still requires summing replicated values in v[] */ 117342550becSJunchao Zhang } 117442550becSJunchao Zhang 117542550becSJunchao Zhang if (akok->coo_has_repeats) { 117642550becSJunchao Zhang Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) { 117742550becSJunchao Zhang for (PetscInt k=jmap(i); k<jmap(i+1); k++) Aa(i) += kv(perm(k)); 117842550becSJunchao Zhang }); 117942550becSJunchao Zhang } else { 118042550becSJunchao Zhang Kokkos::parallel_for(nz,KOKKOS_LAMBDA(const PetscInt i) {Aa(i) += kv(perm(i));}); 118142550becSJunchao Zhang } 118242550becSJunchao Zhang ierr = MatSeqAIJRestoreKokkosView(A,&Aa);CHKERRQ(ierr); 118342550becSJunchao Zhang ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 118442550becSJunchao Zhang ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 118542550becSJunchao Zhang PetscFunctionReturn(0); 118642550becSJunchao Zhang } 118742550becSJunchao Zhang 118886a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 11898f7e8f9dSMark Adams { 11908f7e8f9dSMark Adams PetscErrorCode ierr; 11918f7e8f9dSMark Adams 11928f7e8f9dSMark Adams PetscFunctionBegin; 11938f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 11948f7e8f9dSMark Adams ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 11958f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 11968f7e8f9dSMark Adams PetscFunctionReturn(0); 11978f7e8f9dSMark Adams } 11988f7e8f9dSMark Adams 11998c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 12008c3ff71bSJunchao Zhang { 120142550becSJunchao Zhang PetscErrorCode ierr; 1202076ba34aSJunchao Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1203076ba34aSJunchao Zhang 12048c3ff71bSJunchao Zhang PetscFunctionBegin; 1205076ba34aSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 12066f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 12076f3d89d0SStefano Zampini 12088c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 12098c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 12108c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1211a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 1212f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 1213a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1214076ba34aSJunchao Zhang A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 12158c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 12168c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 12178c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 12188c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 12198c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 12208c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1221076ba34aSJunchao Zhang A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 12220ecb592aSJunchao Zhang A->ops->transpose = MatTranspose_SeqAIJKokkos; 1223152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 1224076ba34aSJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1225076ba34aSJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1226076ba34aSJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1227076ba34aSJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1228076ba34aSJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1229076ba34aSJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 123042550becSJunchao Zhang 123142550becSJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_SeqAIJKokkos);CHKERRQ(ierr); 123242550becSJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos);CHKERRQ(ierr); 1233076ba34aSJunchao Zhang PetscFunctionReturn(0); 1234076ba34aSJunchao Zhang } 1235076ba34aSJunchao Zhang 1236076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1237076ba34aSJunchao Zhang { 1238076ba34aSJunchao Zhang PetscErrorCode ierr; 1239076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 1240076ba34aSJunchao Zhang PetscInt i,m,n; 1241076ba34aSJunchao Zhang 1242076ba34aSJunchao Zhang PetscFunctionBegin; 1243076ba34aSJunchao Zhang if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1244076ba34aSJunchao Zhang 1245076ba34aSJunchao Zhang m = akok->nrows(); 1246076ba34aSJunchao Zhang n = akok->ncols(); 1247076ba34aSJunchao Zhang ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 1248076ba34aSJunchao Zhang ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1249076ba34aSJunchao Zhang 1250076ba34aSJunchao Zhang /* Set up data structures of A as a MATSEQAIJ */ 1251076ba34aSJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1252076ba34aSJunchao Zhang aseq = (Mat_SeqAIJ*)(A)->data; 1253076ba34aSJunchao Zhang 1254076ba34aSJunchao Zhang akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1255076ba34aSJunchao Zhang akok->j_dual.sync_host(); 1256076ba34aSJunchao Zhang 1257076ba34aSJunchao Zhang aseq->i = akok->i_host_data(); 1258076ba34aSJunchao Zhang aseq->j = akok->j_host_data(); 1259076ba34aSJunchao Zhang aseq->a = akok->a_host_data(); 1260076ba34aSJunchao Zhang aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1261076ba34aSJunchao Zhang aseq->singlemalloc = PETSC_FALSE; 1262076ba34aSJunchao Zhang aseq->free_a = PETSC_FALSE; 1263076ba34aSJunchao Zhang aseq->free_ij = PETSC_FALSE; 1264076ba34aSJunchao Zhang aseq->nz = akok->nnz(); 1265076ba34aSJunchao Zhang aseq->maxnz = aseq->nz; 1266076ba34aSJunchao Zhang 1267076ba34aSJunchao Zhang ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1268076ba34aSJunchao Zhang ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1269076ba34aSJunchao Zhang for (i=0; i<m; i++) { 1270076ba34aSJunchao Zhang aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1271076ba34aSJunchao Zhang } 1272076ba34aSJunchao Zhang 1273076ba34aSJunchao Zhang /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1274076ba34aSJunchao Zhang akok->nonzerostate = A->nonzerostate; 1275076ba34aSJunchao Zhang ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1276076ba34aSJunchao Zhang ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1277076ba34aSJunchao Zhang A->spptr = akok; 1278076ba34aSJunchao Zhang PetscFunctionReturn(0); 1279076ba34aSJunchao Zhang } 1280076ba34aSJunchao Zhang 1281076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1282076ba34aSJunchao Zhang 1283076ba34aSJunchao Zhang Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1284076ba34aSJunchao Zhang */ 1285076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1286076ba34aSJunchao Zhang { 1287076ba34aSJunchao Zhang PetscErrorCode ierr; 1288076ba34aSJunchao Zhang 1289076ba34aSJunchao Zhang PetscFunctionBegin; 1290076ba34aSJunchao Zhang ierr = MatCreate(comm,A);CHKERRQ(ierr); 1291076ba34aSJunchao Zhang ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 12928c3ff71bSJunchao Zhang PetscFunctionReturn(0); 12938c3ff71bSJunchao Zhang } 12948c3ff71bSJunchao Zhang 12958c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/ 1296152b3e56SJunchao Zhang /*@C 12978c3ff71bSJunchao Zhang MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 12988c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 12998c3ff71bSJunchao Zhang Kokkos for calculations. For good matrix 13008c3ff71bSJunchao Zhang assembly performance the user should preallocate the matrix storage by setting 13018c3ff71bSJunchao Zhang the parameter nz (or the array nnz). By setting these parameters accurately, 13028c3ff71bSJunchao Zhang performance during matrix assembly can be increased by more than a factor of 50. 13038c3ff71bSJunchao Zhang 13048c3ff71bSJunchao Zhang Collective 13058c3ff71bSJunchao Zhang 13068c3ff71bSJunchao Zhang Input Parameters: 13078c3ff71bSJunchao Zhang + comm - MPI communicator, set to PETSC_COMM_SELF 13088c3ff71bSJunchao Zhang . m - number of rows 13098c3ff71bSJunchao Zhang . n - number of columns 13108c3ff71bSJunchao Zhang . nz - number of nonzeros per row (same for all rows) 13118c3ff71bSJunchao Zhang - nnz - array containing the number of nonzeros in the various rows 13128c3ff71bSJunchao Zhang (possibly different for each row) or NULL 13138c3ff71bSJunchao Zhang 13148c3ff71bSJunchao Zhang Output Parameter: 13158c3ff71bSJunchao Zhang . A - the matrix 13168c3ff71bSJunchao Zhang 13178c3ff71bSJunchao Zhang It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 13188c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 13198c3ff71bSJunchao Zhang [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 13208c3ff71bSJunchao Zhang 13218c3ff71bSJunchao Zhang Notes: 13228c3ff71bSJunchao Zhang If nnz is given then nz is ignored 13238c3ff71bSJunchao Zhang 13248c3ff71bSJunchao Zhang The AIJ format (also called the Yale sparse matrix format or 13258c3ff71bSJunchao Zhang compressed row storage), is fully compatible with standard Fortran 77 13268c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 13278c3ff71bSJunchao Zhang either one (as in Fortran) or zero. See the users' manual for details. 13288c3ff71bSJunchao Zhang 13298c3ff71bSJunchao Zhang Specify the preallocated storage with either nz or nnz (not both). 13308c3ff71bSJunchao Zhang Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 13318c3ff71bSJunchao Zhang allocation. For large problems you MUST preallocate memory or you 13328c3ff71bSJunchao Zhang will get TERRIBLE performance, see the users' manual chapter on matrices. 13338c3ff71bSJunchao Zhang 13348c3ff71bSJunchao Zhang By default, this format uses inodes (identical nodes) when possible, to 13358c3ff71bSJunchao Zhang improve numerical efficiency of matrix-vector products and solves. We 13368c3ff71bSJunchao Zhang search for consecutive rows with the same nonzero structure, thereby 13378c3ff71bSJunchao Zhang reusing matrix information to achieve increased efficiency. 13388c3ff71bSJunchao Zhang 13398c3ff71bSJunchao Zhang Level: intermediate 13408c3ff71bSJunchao Zhang 1341076ba34aSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 13428c3ff71bSJunchao Zhang @*/ 13438c3ff71bSJunchao Zhang PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 13448c3ff71bSJunchao Zhang { 13458c3ff71bSJunchao Zhang PetscErrorCode ierr; 13468c3ff71bSJunchao Zhang 13478c3ff71bSJunchao Zhang PetscFunctionBegin; 13488c3ff71bSJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 13498c3ff71bSJunchao Zhang ierr = MatCreate(comm,A);CHKERRQ(ierr); 13508c3ff71bSJunchao Zhang ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 13518c3ff71bSJunchao Zhang ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 13528c3ff71bSJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 13538c3ff71bSJunchao Zhang PetscFunctionReturn(0); 13548c3ff71bSJunchao Zhang } 1355930e68a5SMark Adams 13568f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member; 13578f7e8f9dSMark Adams // 135846804e07SMark Adams // This factorization exploits block diagonal matrices with "Nf" (not used). 13598f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 13608f7e8f9dSMark Adams // 13618f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1362930e68a5SMark Adams { 13638f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 13648f7e8f9dSMark Adams Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 13658f7e8f9dSMark Adams IS isrow = b->row,isicol = b->icol; 1366930e68a5SMark Adams PetscErrorCode ierr; 13678f7e8f9dSMark Adams const PetscInt *r_h,*ic_h; 1368076ba34aSJunchao Zhang const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data(); 1369076ba34aSJunchao Zhang const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1370076ba34aSJunchao Zhang PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 13718f7e8f9dSMark Adams PetscBool row_identity,col_identity; 137246804e07SMark Adams PetscInt nc, Nf=1, nVec=32; // should be a parameter, Nf is batch size - not used 1373930e68a5SMark Adams 1374930e68a5SMark Adams PetscFunctionBegin; 1375c0aa6a63SJacob Faibussowitsch if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %" PetscInt_FMT " %" PetscInt_FMT,A->rmap->n,n); 13768f7e8f9dSMark Adams ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 13778f7e8f9dSMark Adams if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 13788f7e8f9dSMark Adams ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 13798f7e8f9dSMark Adams ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 13808f7e8f9dSMark Adams ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 13818f7e8f9dSMark Adams ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 13828f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 13838f7e8f9dSMark Adams { 13848f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1 13858f7e8f9dSMark Adams using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 13868f7e8f9dSMark Adams using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 13878f7e8f9dSMark Adams using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 13888f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 13898f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 13908f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 13918f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 13928f7e8f9dSMark Adams size_t flops_h = 0.0; 13938f7e8f9dSMark Adams Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 13948f7e8f9dSMark Adams Kokkos::View<size_t> d_flops_k ("flops"); 13958f7e8f9dSMark Adams const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 13968f7e8f9dSMark Adams const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 13978f7e8f9dSMark Adams Kokkos::deep_copy (d_flops_k, h_flops_k); 13988f7e8f9dSMark Adams Kokkos::deep_copy (d_r_k, h_r_k); 13998f7e8f9dSMark Adams Kokkos::deep_copy (d_ic_k, h_ic_k); 14008f7e8f9dSMark Adams // Fill A --> fact 14018f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1402042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 14038f7e8f9dSMark Adams const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 14048f7e8f9dSMark Adams const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 14058f7e8f9dSMark Adams // zero rows of B 14068f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 14078f7e8f9dSMark Adams PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 14088f7e8f9dSMark Adams PetscScalar *baL = ba_d + bi_d[rowb]; 14098f7e8f9dSMark Adams PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 14108f7e8f9dSMark Adams /* zero (unfactored row) */ 14118f7e8f9dSMark Adams for (int j=0;j<nzbL;j++) baL[j] = 0; 14128f7e8f9dSMark Adams for (int j=0;j<nzbU;j++) baU[j] = 0; 14138f7e8f9dSMark Adams }); 14148f7e8f9dSMark Adams // copy A into B 14158f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 14168f7e8f9dSMark Adams PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 14178f7e8f9dSMark Adams const PetscScalar *av = aa_d + ai_d[rowa]; 14188f7e8f9dSMark Adams const PetscInt *ajtmp = aj_d + ai_d[rowa]; 14198f7e8f9dSMark Adams /* load in initial (unfactored row) */ 14208f7e8f9dSMark Adams for (int j=0;j<nza;j++) { 14218f7e8f9dSMark Adams PetscInt colb = ic[ajtmp[j]]; 14228f7e8f9dSMark Adams PetscScalar vala = av[j]; 14238f7e8f9dSMark Adams if (colb == rowb) { 14248f7e8f9dSMark Adams *(ba_d + bdiag_d[rowb]) = vala; 14258f7e8f9dSMark Adams } else { 14268f7e8f9dSMark Adams const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 14278f7e8f9dSMark Adams PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 14288f7e8f9dSMark Adams PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 14298f7e8f9dSMark Adams for (int j=0; j<nz ; j++) { 14308f7e8f9dSMark Adams if (pbj[j] == colb) { 14318f7e8f9dSMark Adams pba[j] = vala; 14328f7e8f9dSMark Adams set++; 14338f7e8f9dSMark Adams break; 14348f7e8f9dSMark Adams } 14358f7e8f9dSMark Adams } 1436*8f1da0b2SJunchao Zhang #if !defined(PETSC_HAVE_SYCL) 14378f7e8f9dSMark Adams if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 1438*8f1da0b2SJunchao Zhang #endif 14398f7e8f9dSMark Adams } 14408f7e8f9dSMark Adams } 14418f7e8f9dSMark Adams }); 14428f7e8f9dSMark Adams }); 14438f7e8f9dSMark Adams Kokkos::fence(); 1444930e68a5SMark Adams 14458f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 14468f7e8f9dSMark Adams sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 14478f7e8f9dSMark Adams scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 14488f7e8f9dSMark Adams sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1449042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 14508f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc; 14518f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 14528f7e8f9dSMark Adams // A22 panel update for each row A(1,:) and col A(:,1) 14538f7e8f9dSMark Adams for (int ii=start; ii<end-1; ii++) { 14548f7e8f9dSMark Adams const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 14558f7e8f9dSMark Adams const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 14568f7e8f9dSMark Adams const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 14578f7e8f9dSMark Adams const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 14588f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 14598f7e8f9dSMark Adams PetscInt kIdx = j*Ni + field_block_idx; 14608f7e8f9dSMark Adams if (kIdx >= nzUi) /* void */ ; 14618f7e8f9dSMark Adams else { 14628f7e8f9dSMark Adams const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 14638f7e8f9dSMark Adams const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 14648f7e8f9dSMark Adams const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 14658f7e8f9dSMark Adams size_t st_idx; 14668f7e8f9dSMark Adams // find and do L(k,i) = A(:k,i) / A(i,i) 14678f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 14688f7e8f9dSMark Adams // get column, there has got to be a better way 14698f7e8f9dSMark Adams Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 14708f7e8f9dSMark Adams if (pjL[j] == ii) { 14718f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + j; 14728f7e8f9dSMark Adams idx = j; // output 14738f7e8f9dSMark Adams *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 14748f7e8f9dSMark Adams } 14758f7e8f9dSMark Adams }, st_idx); 14768f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 1477*8f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 147899551766SMark Adams if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",(int)myk,ii); // uses a register 147999551766SMark Adams #endif 148099551766SMark Adams // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 14818f7e8f9dSMark Adams // U(i+1,:end) 14828f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 14838f7e8f9dSMark Adams PetscScalar Uij = baUi[uiIdx]; 14848f7e8f9dSMark Adams PetscInt col = bjUi[uiIdx]; 14858f7e8f9dSMark Adams if (col==myk) { 14868f7e8f9dSMark Adams // A_kk = A_kk - L_ki * U_ij(k) 14878f7e8f9dSMark Adams PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 14888f7e8f9dSMark Adams *Akkv = *Akkv - L_ki() * Uij; // UiK 14898f7e8f9dSMark Adams } else { 14908f7e8f9dSMark Adams PetscScalar *start, *end, *pAkjv=NULL; 14918f7e8f9dSMark Adams PetscInt high, low; 14928f7e8f9dSMark Adams const PetscInt *startj; 14938f7e8f9dSMark Adams if (col<myk) { // L 14948f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 14958f7e8f9dSMark Adams PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 14968f7e8f9dSMark Adams start = pLki+1; // start at pLki+1, A22(myk,1) 14978f7e8f9dSMark Adams startj= bj_d + bi_d[myk] + idx; 14988f7e8f9dSMark Adams end = ba_d + bi_d[myk+1]; 14998f7e8f9dSMark Adams } else { 15008f7e8f9dSMark Adams PetscInt idx = bdiag_d[myk+1]+1; 15018f7e8f9dSMark Adams start = ba_d + idx; 15028f7e8f9dSMark Adams startj= bj_d + idx; 15038f7e8f9dSMark Adams end = ba_d + bdiag_d[myk]; 15048f7e8f9dSMark Adams } 15058f7e8f9dSMark Adams // search for 'col', use bisection search - TODO 15068f7e8f9dSMark Adams low = 0; 15078f7e8f9dSMark Adams high = (PetscInt)(end-start); 15088f7e8f9dSMark Adams while (high-low > 5) { 15098f7e8f9dSMark Adams int t = (low+high)/2; 15108f7e8f9dSMark Adams if (startj[t] > col) high = t; 15118f7e8f9dSMark Adams else low = t; 15128f7e8f9dSMark Adams } 15138f7e8f9dSMark Adams for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 15148f7e8f9dSMark Adams if (startj[pAkjv-start] == col) break; 15158f7e8f9dSMark Adams } 1516*8f1da0b2SJunchao Zhang #if defined(PETSC_USE_DEBUG) && !defined(PETSC_HAVE_SYCL) 151799551766SMark Adams if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",(int)myk,(int)col); // uses a register 151899551766SMark Adams #endif 15198f7e8f9dSMark Adams *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 15208f7e8f9dSMark Adams } 15218f7e8f9dSMark Adams }); 15228f7e8f9dSMark Adams } 15238f7e8f9dSMark Adams }); 15248f7e8f9dSMark Adams team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 15258f7e8f9dSMark Adams if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 15268f7e8f9dSMark Adams } /* endof for (i=0; i<n; i++) { */ 15278f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 15288f7e8f9dSMark Adams }); 15298f7e8f9dSMark Adams Kokkos::fence(); 15308f7e8f9dSMark Adams Kokkos::deep_copy (h_flops_k, d_flops_k); 15318f7e8f9dSMark Adams ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 15328f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 15338f7e8f9dSMark Adams const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 15348f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 15358f7e8f9dSMark Adams /* Invert diagonal for simpler triangular solves */ 15368f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 15378f7e8f9dSMark Adams int i = start + outer_index*Ni + lg_rank%Ni; 15388f7e8f9dSMark Adams if (i < end) { 15398f7e8f9dSMark Adams PetscScalar *pv = ba_d + bdiag_d[i]; 15408f7e8f9dSMark Adams *pv = 1.0/(*pv); 15418f7e8f9dSMark Adams } 15428f7e8f9dSMark Adams }); 15438f7e8f9dSMark Adams }); 15448f7e8f9dSMark Adams } 15458f7e8f9dSMark Adams ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 15468f7e8f9dSMark Adams ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 15478f7e8f9dSMark Adams ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 15488f7e8f9dSMark Adams 15498f7e8f9dSMark Adams ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 15508f7e8f9dSMark Adams ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 15518f7e8f9dSMark Adams if (b->inode.size) { 15528f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_Inode; 15538f7e8f9dSMark Adams } else if (row_identity && col_identity) { 15548f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 15558f7e8f9dSMark Adams } else { 15568f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 15578f7e8f9dSMark Adams } 15588f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_GPU; 15598f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 15608f7e8f9dSMark Adams B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 15618f7e8f9dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 15628f7e8f9dSMark Adams B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 15638f7e8f9dSMark Adams B->ops->matsolve = MatMatSolve_SeqAIJ; 15648f7e8f9dSMark Adams B->assembled = PETSC_TRUE; 15658f7e8f9dSMark Adams B->preallocated = PETSC_TRUE; 15668f7e8f9dSMark Adams 1567930e68a5SMark Adams PetscFunctionReturn(0); 1568930e68a5SMark Adams } 1569930e68a5SMark Adams 157086a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1571930e68a5SMark Adams { 1572930e68a5SMark Adams PetscErrorCode ierr; 1573930e68a5SMark Adams 1574930e68a5SMark Adams PetscFunctionBegin; 1575930e68a5SMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 157686a27549SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 157786a27549SJunchao Zhang PetscFunctionReturn(0); 157886a27549SJunchao Zhang } 157986a27549SJunchao Zhang 158086a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 158186a27549SJunchao Zhang { 158286a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 158386a27549SJunchao Zhang 158486a27549SJunchao Zhang PetscFunctionBegin; 158586a27549SJunchao Zhang if (!factors->sptrsv_symbolic_completed) { 158686a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 158786a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 158886a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_TRUE; 158986a27549SJunchao Zhang } 159086a27549SJunchao Zhang PetscFunctionReturn(0); 159186a27549SJunchao Zhang } 159286a27549SJunchao Zhang 159386a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */ 159486a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 159586a27549SJunchao Zhang { 159686a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1597076ba34aSJunchao Zhang MatColIdxType n = A->rmap->n; 159886a27549SJunchao Zhang 159986a27549SJunchao Zhang PetscFunctionBegin; 160086a27549SJunchao Zhang if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 160186a27549SJunchao Zhang /* Update L^T and do sptrsv symbolic */ 1602076ba34aSJunchao Zhang factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 160386a27549SJunchao Zhang Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1604076ba34aSJunchao Zhang factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1605076ba34aSJunchao Zhang factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 160686a27549SJunchao Zhang 160786a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1608076ba34aSJunchao Zhang ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1609076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1610076ba34aSJunchao Zhang MatRowMapKokkosView,DefaultExecutionSpace>( 161186a27549SJunchao Zhang n,n,factors->iL_d,factors->jL_d,factors->aL_d, 161286a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 161386a27549SJunchao Zhang 161486a27549SJunchao Zhang /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 161586a27549SJunchao Zhang We have to sort the indices, until KK provides finer control options. 161686a27549SJunchao Zhang */ 1617076ba34aSJunchao Zhang KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1618076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 161986a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 162086a27549SJunchao Zhang 162186a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 162286a27549SJunchao Zhang 162386a27549SJunchao Zhang /* Update U^T and do sptrsv symbolic */ 1624076ba34aSJunchao Zhang factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 162586a27549SJunchao Zhang Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1626076ba34aSJunchao Zhang factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1627076ba34aSJunchao Zhang factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 162886a27549SJunchao Zhang 162986a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1630076ba34aSJunchao Zhang ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1631076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1632076ba34aSJunchao Zhang MatRowMapKokkosView,DefaultExecutionSpace>( 163386a27549SJunchao Zhang n,n,factors->iU_d, factors->jU_d, factors->aU_d, 163486a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 163586a27549SJunchao Zhang 163686a27549SJunchao Zhang /* Sort indices. See comments above */ 1637076ba34aSJunchao Zhang KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1638076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 163986a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 164086a27549SJunchao Zhang 164186a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 164286a27549SJunchao Zhang factors->transpose_updated = PETSC_TRUE; 164386a27549SJunchao Zhang } 164486a27549SJunchao Zhang PetscFunctionReturn(0); 164586a27549SJunchao Zhang } 164686a27549SJunchao Zhang 164786a27549SJunchao Zhang /* Solve Ax = b, with A = LU */ 164886a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 164986a27549SJunchao Zhang { 165086a27549SJunchao Zhang PetscErrorCode ierr; 165186a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 165286a27549SJunchao Zhang PetscScalarKokkosView xv; 165386a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 165486a27549SJunchao Zhang 165586a27549SJunchao Zhang PetscFunctionBegin; 1656eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 165786a27549SJunchao Zhang ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 165886a27549SJunchao Zhang ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 165986a27549SJunchao Zhang ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 166086a27549SJunchao Zhang /* Solve L tmpv = b */ 16613f520e80SJunchao Zhang CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 166286a27549SJunchao Zhang /* Solve Ux = tmpv */ 16633f520e80SJunchao Zhang CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 166486a27549SJunchao Zhang ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 166586a27549SJunchao Zhang ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1666eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 166786a27549SJunchao Zhang PetscFunctionReturn(0); 166886a27549SJunchao Zhang } 166986a27549SJunchao Zhang 1670076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */ 167186a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 167286a27549SJunchao Zhang { 167386a27549SJunchao Zhang PetscErrorCode ierr; 167486a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 167586a27549SJunchao Zhang PetscScalarKokkosView xv; 167686a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 167786a27549SJunchao Zhang 167886a27549SJunchao Zhang PetscFunctionBegin; 1679eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 168086a27549SJunchao Zhang ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 168186a27549SJunchao Zhang ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 168286a27549SJunchao Zhang ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 168386a27549SJunchao Zhang /* Solve U^T tmpv = b */ 168486a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 168586a27549SJunchao Zhang 168686a27549SJunchao Zhang /* Solve L^T x = tmpv */ 168786a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 168886a27549SJunchao Zhang ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 168986a27549SJunchao Zhang ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 1690eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 169186a27549SJunchao Zhang PetscFunctionReturn(0); 169286a27549SJunchao Zhang } 169386a27549SJunchao Zhang 169486a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 169586a27549SJunchao Zhang { 169686a27549SJunchao Zhang PetscErrorCode ierr; 169786a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 169886a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 169986a27549SJunchao Zhang PetscInt fill_lev = info->levels; 170086a27549SJunchao Zhang 170186a27549SJunchao Zhang PetscFunctionBegin; 1702eeadb341SJunchao Zhang ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 170386a27549SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1704076ba34aSJunchao Zhang 1705076ba34aSJunchao Zhang auto a_d = aijkok->a_dual.view_device(); 1706076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1707076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1708076ba34aSJunchao Zhang 1709076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 171086a27549SJunchao Zhang 171186a27549SJunchao Zhang B->assembled = PETSC_TRUE; 171286a27549SJunchao Zhang B->preallocated = PETSC_TRUE; 171386a27549SJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos; 171486a27549SJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 171586a27549SJunchao Zhang B->ops->matsolve = NULL; 171686a27549SJunchao Zhang B->ops->matsolvetranspose = NULL; 171786a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 171886a27549SJunchao Zhang 171986a27549SJunchao Zhang /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 172086a27549SJunchao Zhang factors->transpose_updated = PETSC_FALSE; 172186a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 1722eeadb341SJunchao Zhang /* TODO: log flops, but how to know that? */ 1723eeadb341SJunchao Zhang ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 172486a27549SJunchao Zhang PetscFunctionReturn(0); 172586a27549SJunchao Zhang } 172686a27549SJunchao Zhang 172786a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 172886a27549SJunchao Zhang { 172986a27549SJunchao Zhang PetscErrorCode ierr; 173086a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 173186a27549SJunchao Zhang Mat_SeqAIJ *b; 173286a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 173386a27549SJunchao Zhang PetscInt fill_lev = info->levels; 173486a27549SJunchao Zhang PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 173586a27549SJunchao Zhang PetscInt n = A->rmap->n; 173686a27549SJunchao Zhang 173786a27549SJunchao Zhang PetscFunctionBegin; 173886a27549SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 173986a27549SJunchao Zhang /* Rebuild factors */ 174086a27549SJunchao Zhang if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 174186a27549SJunchao Zhang else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 174286a27549SJunchao Zhang 174386a27549SJunchao Zhang /* Create a spiluk handle and then do symbolic factorization */ 174486a27549SJunchao Zhang nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 174586a27549SJunchao Zhang factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 174686a27549SJunchao Zhang 174786a27549SJunchao Zhang auto spiluk_handle = factors->kh.get_spiluk_handle(); 174886a27549SJunchao Zhang 174986a27549SJunchao Zhang Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 175086a27549SJunchao Zhang Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 175186a27549SJunchao Zhang Kokkos::realloc(factors->iU_d,n+1); 175286a27549SJunchao Zhang Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 175386a27549SJunchao Zhang 175486a27549SJunchao Zhang aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1755076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1756076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1757076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 175886a27549SJunchao Zhang /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 175986a27549SJunchao Zhang 176086a27549SJunchao Zhang Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 176186a27549SJunchao Zhang Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 176286a27549SJunchao Zhang Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 176386a27549SJunchao Zhang Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 176486a27549SJunchao Zhang 176586a27549SJunchao Zhang /* TODO: add options to select sptrsv algorithms */ 176686a27549SJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 176786a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 176886a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 176986a27549SJunchao Zhang #else 177086a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 177186a27549SJunchao Zhang #endif 177286a27549SJunchao Zhang 177386a27549SJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 177486a27549SJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 177586a27549SJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 177686a27549SJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 177786a27549SJunchao Zhang 177886a27549SJunchao Zhang /* Fill fields of the factor matrix B */ 177986a27549SJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 178086a27549SJunchao Zhang b = (Mat_SeqAIJ*)B->data; 178186a27549SJunchao Zhang b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 178286a27549SJunchao Zhang B->info.fill_ratio_given = info->fill; 178386a27549SJunchao Zhang B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 178486a27549SJunchao Zhang 178586a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 178686a27549SJunchao Zhang B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1787930e68a5SMark Adams PetscFunctionReturn(0); 1788930e68a5SMark Adams } 1789930e68a5SMark Adams 17908f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 17918f7e8f9dSMark Adams { 17928f7e8f9dSMark Adams PetscErrorCode ierr; 17938f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 17948f7e8f9dSMark Adams const PetscInt nrows = A->rmap->n; 1795930e68a5SMark Adams 17968f7e8f9dSMark Adams PetscFunctionBegin; 17978f7e8f9dSMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 17988f7e8f9dSMark Adams B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 17998f7e8f9dSMark Adams // move B data into Kokkos 18008f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 18018f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 18028f7e8f9dSMark Adams { 18038f7e8f9dSMark Adams Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 18048f7e8f9dSMark Adams if (!baijkok->diag_d) { 18058f7e8f9dSMark Adams const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1806152b3e56SJunchao Zhang baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 18078f7e8f9dSMark Adams Kokkos::deep_copy (*baijkok->diag_d, h_diag); 18088f7e8f9dSMark Adams } 18098f7e8f9dSMark Adams } 18108f7e8f9dSMark Adams PetscFunctionReturn(0); 18118f7e8f9dSMark Adams } 18128f7e8f9dSMark Adams 181386a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1814930e68a5SMark Adams { 1815930e68a5SMark Adams PetscFunctionBegin; 1816930e68a5SMark Adams *type = MATSOLVERKOKKOS; 1817930e68a5SMark Adams PetscFunctionReturn(0); 1818930e68a5SMark Adams } 1819930e68a5SMark Adams 18208f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 18218f7e8f9dSMark Adams { 18228f7e8f9dSMark Adams PetscFunctionBegin; 18238f7e8f9dSMark Adams *type = MATSOLVERKOKKOSDEVICE; 18248f7e8f9dSMark Adams PetscFunctionReturn(0); 18258f7e8f9dSMark Adams } 18268f7e8f9dSMark Adams 1827930e68a5SMark Adams /*MC 182886a27549SJunchao Zhang MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 182986a27549SJunchao Zhang on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1830930e68a5SMark Adams 1831930e68a5SMark Adams Level: beginner 1832930e68a5SMark Adams 183386a27549SJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1834930e68a5SMark Adams M*/ 183586a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1836930e68a5SMark Adams { 1837930e68a5SMark Adams PetscErrorCode ierr; 1838930e68a5SMark Adams PetscInt n = A->rmap->n; 1839930e68a5SMark Adams 1840930e68a5SMark Adams PetscFunctionBegin; 1841930e68a5SMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1842930e68a5SMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1843930e68a5SMark Adams (*B)->factortype = ftype; 18444ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1845930e68a5SMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1846930e68a5SMark Adams 18478f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 1848930e68a5SMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 184986a27549SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 185086a27549SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 185186a27549SJunchao Zhang } else if (ftype == MAT_FACTOR_ILU) { 185286a27549SJunchao Zhang ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 185386a27549SJunchao Zhang (*B)->canuseordering = PETSC_FALSE; 185486a27549SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 185586a27549SJunchao Zhang } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1856930e68a5SMark Adams 1857930e68a5SMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 185886a27549SJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1859930e68a5SMark Adams PetscFunctionReturn(0); 1860930e68a5SMark Adams } 18618f7e8f9dSMark Adams 18628f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 18638f7e8f9dSMark Adams { 18648f7e8f9dSMark Adams PetscErrorCode ierr; 18658f7e8f9dSMark Adams PetscInt n = A->rmap->n; 18668f7e8f9dSMark Adams 18678f7e8f9dSMark Adams PetscFunctionBegin; 18688f7e8f9dSMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 18698f7e8f9dSMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 18708f7e8f9dSMark Adams (*B)->factortype = ftype; 1871f73b0415SBarry Smith (*B)->canuseordering = PETSC_TRUE; 18724ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 18738f7e8f9dSMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 18748f7e8f9dSMark Adams 18758f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 18768f7e8f9dSMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 18778f7e8f9dSMark Adams (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 18788f7e8f9dSMark Adams } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 18798f7e8f9dSMark Adams 18808f7e8f9dSMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 18818f7e8f9dSMark Adams ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 18828f7e8f9dSMark Adams PetscFunctionReturn(0); 18838f7e8f9dSMark Adams } 188486a27549SJunchao Zhang 188586a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 188686a27549SJunchao Zhang { 188786a27549SJunchao Zhang PetscErrorCode ierr; 188886a27549SJunchao Zhang 188986a27549SJunchao Zhang PetscFunctionBegin; 189086a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 189186a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 189286a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 189386a27549SJunchao Zhang PetscFunctionReturn(0); 189486a27549SJunchao Zhang } 189586a27549SJunchao Zhang 1896076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */ 1897076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1898076ba34aSJunchao Zhang { 1899076ba34aSJunchao Zhang PetscErrorCode ierr; 1900076ba34aSJunchao Zhang const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1901076ba34aSJunchao Zhang const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1902076ba34aSJunchao Zhang const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1903076ba34aSJunchao Zhang const PetscInt *i = iv.data(); 1904076ba34aSJunchao Zhang const PetscInt *j = jv.data(); 1905076ba34aSJunchao Zhang const PetscScalar *a = av.data(); 1906076ba34aSJunchao Zhang PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1907076ba34aSJunchao Zhang 1908076ba34aSJunchao Zhang PetscFunctionBegin; 1909c0aa6a63SJacob Faibussowitsch ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1910076ba34aSJunchao Zhang for (PetscInt k=0; k<m; k++) { 1911c0aa6a63SJacob Faibussowitsch ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT ": ",k);CHKERRQ(ierr); 1912076ba34aSJunchao Zhang for (PetscInt p=i[k]; p<i[k+1]; p++) { 1913c0aa6a63SJacob Faibussowitsch ierr = PetscPrintf(PETSC_COMM_SELF,"%" PetscInt_FMT "(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1914076ba34aSJunchao Zhang } 1915076ba34aSJunchao Zhang ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1916076ba34aSJunchao Zhang } 1917076ba34aSJunchao Zhang PetscFunctionReturn(0); 1918076ba34aSJunchao Zhang } 1919