111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 2076ba34aSJunchao Zhang #include <petscpkg_version.h> 3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 48c3ff71bSJunchao Zhang #include <petscsystypes.h> 58c3ff71bSJunchao Zhang #include <petscerror.h> 68c3ff71bSJunchao Zhang 78c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 8f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 9076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp> 108c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 118c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 1286a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp> 1386a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp> 14076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp> 15076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp> 1686a27549SJunchao Zhang 178c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h> 188c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 198c3ff71bSJunchao Zhang 208c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 218c3ff71bSJunchao Zhang 22076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23076ba34aSJunchao Zhang we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24076ba34aSJunchao Zhang In the latter case, it is important to set a_dual's sync state correctly. 25076ba34aSJunchao Zhang */ 268c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 278c3ff71bSJunchao Zhang { 288c3ff71bSJunchao Zhang PetscErrorCode ierr; 29076ba34aSJunchao Zhang Mat_SeqAIJ *aijseq; 30076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 318c3ff71bSJunchao Zhang 328c3ff71bSJunchao Zhang PetscFunctionBegin; 33076ba34aSJunchao Zhang if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 348c3ff71bSJunchao Zhang ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 35076ba34aSJunchao Zhang 36076ba34aSJunchao Zhang aijseq = static_cast<Mat_SeqAIJ*>(A->data); 37076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 38076ba34aSJunchao Zhang 39076ba34aSJunchao Zhang /* If aijkok does not exist, we just copy i, j to device. 40076ba34aSJunchao Zhang If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 41076ba34aSJunchao Zhang In both cases, we build a new aijkok structure. 42076ba34aSJunchao Zhang */ 43076ba34aSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 44076ba34aSJunchao Zhang delete aijkok; 45076ba34aSJunchao Zhang aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 46076ba34aSJunchao Zhang A->spptr = aijkok; 47076ba34aSJunchao Zhang } 48076ba34aSJunchao Zhang 49a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 50a587d139SMark A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 51a587d139SMark } 528c3ff71bSJunchao Zhang PetscFunctionReturn(0); 538c3ff71bSJunchao Zhang } 548c3ff71bSJunchao Zhang 5586a27549SJunchao Zhang /* Sync CSR data to device if not yet */ 56076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 578c3ff71bSJunchao Zhang { 588c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 598c3ff71bSJunchao Zhang 608c3ff71bSJunchao Zhang PetscFunctionBegin; 6186a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 62076ba34aSJunchao Zhang if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); 63076ba34aSJunchao Zhang if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 64076ba34aSJunchao Zhang if (aijkok->a_dual.need_sync_device()) { 65076ba34aSJunchao Zhang aijkok->a_dual.sync_device(); 66076ba34aSJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */ 6786a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 688c3ff71bSJunchao Zhang } 698c3ff71bSJunchao Zhang PetscFunctionReturn(0); 708c3ff71bSJunchao Zhang } 718c3ff71bSJunchao Zhang 72076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */ 73076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 7486a27549SJunchao Zhang { 7586a27549SJunchao Zhang PetscErrorCode ierr; 7686a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 7786a27549SJunchao Zhang 7886a27549SJunchao Zhang PetscFunctionBegin; 7986a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 8086a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 8186a27549SJunchao Zhang aijkok->a_dual.modify_device(); 8286a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 8386a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 8486a27549SJunchao Zhang ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 8586a27549SJunchao Zhang PetscFunctionReturn(0); 8686a27549SJunchao Zhang } 8786a27549SJunchao Zhang 88f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 89f0cf5187SStefano Zampini { 90f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 91f0cf5187SStefano Zampini 92f0cf5187SStefano Zampini PetscFunctionBegin; 93f0cf5187SStefano Zampini PetscCheckTypeName(A,MATSEQAIJKOKKOS); 9486a27549SJunchao Zhang /* We do not expect one needs factors on host */ 9586a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 96f0cf5187SStefano Zampini if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 97076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 98f0cf5187SStefano Zampini PetscFunctionReturn(0); 99f0cf5187SStefano Zampini } 100f0cf5187SStefano Zampini 101f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 102f0cf5187SStefano Zampini { 103076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 104f0cf5187SStefano Zampini 105f0cf5187SStefano Zampini PetscFunctionBegin; 106076ba34aSJunchao Zhang if (aijkok) { 107076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 108076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 109076ba34aSJunchao Zhang } else { /* Happens when calling MatSetValues on a newly created matrix */ 110076ba34aSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 111076ba34aSJunchao Zhang } 112076ba34aSJunchao Zhang PetscFunctionReturn(0); 113076ba34aSJunchao Zhang } 114076ba34aSJunchao Zhang 115076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 116076ba34aSJunchao Zhang { 117076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 118076ba34aSJunchao Zhang 119076ba34aSJunchao Zhang PetscFunctionBegin; 120076ba34aSJunchao Zhang if (aijkok) aijkok->a_dual.modify_host(); 121076ba34aSJunchao Zhang PetscFunctionReturn(0); 122076ba34aSJunchao Zhang } 123076ba34aSJunchao Zhang 124076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 125076ba34aSJunchao Zhang { 126076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 127076ba34aSJunchao Zhang 128076ba34aSJunchao Zhang PetscFunctionBegin; 129076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 130076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 131076ba34aSJunchao Zhang PetscFunctionReturn(0); 132076ba34aSJunchao Zhang } 133076ba34aSJunchao Zhang 134076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 135076ba34aSJunchao Zhang { 136076ba34aSJunchao Zhang PetscFunctionBegin; 137076ba34aSJunchao Zhang *array = NULL; 138076ba34aSJunchao Zhang PetscFunctionReturn(0); 139076ba34aSJunchao Zhang } 140076ba34aSJunchao Zhang 141076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 142076ba34aSJunchao Zhang { 143076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 144076ba34aSJunchao Zhang 145076ba34aSJunchao Zhang PetscFunctionBegin; 146076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 147076ba34aSJunchao Zhang PetscFunctionReturn(0); 148076ba34aSJunchao Zhang } 149076ba34aSJunchao Zhang 150076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 151076ba34aSJunchao Zhang { 152076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 153076ba34aSJunchao Zhang 154076ba34aSJunchao Zhang PetscFunctionBegin; 155076ba34aSJunchao Zhang aijkok->a_dual.clear_sync_state(); 156076ba34aSJunchao Zhang aijkok->a_dual.modify_host(); 157f0cf5187SStefano Zampini PetscFunctionReturn(0); 158f0cf5187SStefano Zampini } 159f0cf5187SStefano Zampini 160a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 161042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 162a587d139SMark { 163042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 164042217e8SBarry Smith Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 165a587d139SMark 166a587d139SMark PetscFunctionBegin; 167076ba34aSJunchao Zhang if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 168152b3e56SJunchao Zhang aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 169a587d139SMark Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 170a587d139SMark PetscFunctionReturn(0); 171a587d139SMark } 172a587d139SMark 173a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 174042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 175a587d139SMark { 176042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 177a587d139SMark 178a587d139SMark PetscFunctionBegin; 179a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 180a587d139SMark *d_mat = aijkok->device_mat_d.data(); 181a587d139SMark } else { 182a587d139SMark PetscErrorCode ierr; 183a587d139SMark ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 184a587d139SMark *d_mat = NULL; 185a587d139SMark } 186a587d139SMark PetscFunctionReturn(0); 187a587d139SMark } 188076ba34aSJunchao Zhang 189076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */ 190076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 191152b3e56SJunchao Zhang { 192152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 193152b3e56SJunchao Zhang 194152b3e56SJunchao Zhang PetscFunctionBegin; 195076ba34aSJunchao Zhang if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 196076ba34aSJunchao Zhang if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 197076ba34aSJunchao Zhang /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 198076ba34aSJunchao Zhang CHKERRCXX(aijkok->a_dual.sync_device()); 199076ba34aSJunchao Zhang CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 200076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 20186a27549SJunchao Zhang aijkok->transpose_updated = PETSC_TRUE; 202076ba34aSJunchao Zhang } 203076ba34aSJunchao Zhang *csrmatT = &aijkok->csrmatT; 204152b3e56SJunchao Zhang PetscFunctionReturn(0); 205152b3e56SJunchao Zhang } 206152b3e56SJunchao Zhang 207076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */ 208076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 209152b3e56SJunchao Zhang { 210152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 211152b3e56SJunchao Zhang 212152b3e56SJunchao Zhang PetscFunctionBegin; 213076ba34aSJunchao Zhang if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 214076ba34aSJunchao Zhang if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 215076ba34aSJunchao Zhang CHKERRCXX(aijkok->a_dual.sync_device()); 216076ba34aSJunchao Zhang CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 217076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 218076ba34aSJunchao Zhang #if defined(PETSC_USE_COMPLEX) 219076ba34aSJunchao Zhang const auto& a = aijkok->csrmatH.values; 220076ba34aSJunchao Zhang Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 221076ba34aSJunchao Zhang #endif 22286a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_TRUE; 223076ba34aSJunchao Zhang } 224076ba34aSJunchao Zhang *csrmatH = &aijkok->csrmatH; 225152b3e56SJunchao Zhang PetscFunctionReturn(0); 226152b3e56SJunchao Zhang } 227a587d139SMark 2288c3ff71bSJunchao Zhang /* y = A x */ 2298c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2308c3ff71bSJunchao Zhang { 2318c3ff71bSJunchao Zhang PetscErrorCode ierr; 2328c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 233152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 234152b3e56SJunchao Zhang PetscScalarKokkosView yv; 2358c3ff71bSJunchao Zhang 2368c3ff71bSJunchao Zhang PetscFunctionBegin; 2378c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 238152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 239152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 2408c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 241152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 242152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 243152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 244bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 245076ba34aSJunchao Zhang /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 246152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 2478c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2488c3ff71bSJunchao Zhang } 2498c3ff71bSJunchao Zhang 2508c3ff71bSJunchao Zhang /* y = A^T x */ 2518c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2528c3ff71bSJunchao Zhang { 2538c3ff71bSJunchao Zhang PetscErrorCode ierr; 2548c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 255152b3e56SJunchao Zhang const char *mode; 256152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 257152b3e56SJunchao Zhang PetscScalarKokkosView yv; 258076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 2598c3ff71bSJunchao Zhang 2608c3ff71bSJunchao Zhang PetscFunctionBegin; 2618c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 262152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 263152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 264152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 265076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 266152b3e56SJunchao Zhang mode = "N"; 267152b3e56SJunchao Zhang } else { 268076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 269076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 270152b3e56SJunchao Zhang mode = "T"; 271152b3e56SJunchao Zhang } 272076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 273152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 274152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 275bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 276076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 2778c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2788c3ff71bSJunchao Zhang } 2798c3ff71bSJunchao Zhang 2808c3ff71bSJunchao Zhang /* y = A^H x */ 2818c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2828c3ff71bSJunchao Zhang { 2838c3ff71bSJunchao Zhang PetscErrorCode ierr; 2848c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 285152b3e56SJunchao Zhang const char *mode; 286152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 287152b3e56SJunchao Zhang PetscScalarKokkosView yv; 288076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 2898c3ff71bSJunchao Zhang 2908c3ff71bSJunchao Zhang PetscFunctionBegin; 2918c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 292152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 293152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 294152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 295076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 296152b3e56SJunchao Zhang mode = "N"; 297152b3e56SJunchao Zhang } else { 298076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 299076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 300152b3e56SJunchao Zhang mode = "C"; 301152b3e56SJunchao Zhang } 302076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 303152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 304152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 305bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 306076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 3078c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3088c3ff71bSJunchao Zhang } 3098c3ff71bSJunchao Zhang 3108c3ff71bSJunchao Zhang /* z = A x + y */ 3118c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 3128c3ff71bSJunchao Zhang { 3138c3ff71bSJunchao Zhang PetscErrorCode ierr; 3148c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 315152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 316152b3e56SJunchao Zhang PetscScalarKokkosView zv; 3178c3ff71bSJunchao Zhang 3188c3ff71bSJunchao Zhang PetscFunctionBegin; 3198c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 320152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 321152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 322152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 3238c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 3248c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 325152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 326152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 327152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 328152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 329bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 330152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 3318c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3328c3ff71bSJunchao Zhang } 3338c3ff71bSJunchao Zhang 3348c3ff71bSJunchao Zhang /* z = A^T x + y */ 3358c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3368c3ff71bSJunchao Zhang { 3378c3ff71bSJunchao Zhang PetscErrorCode ierr; 3388c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 339152b3e56SJunchao Zhang const char *mode; 340152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 341152b3e56SJunchao Zhang PetscScalarKokkosView zv; 342076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3438c3ff71bSJunchao Zhang 3448c3ff71bSJunchao Zhang PetscFunctionBegin; 3458c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 346152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 347152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 348152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 3498c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 350152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 351076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 352152b3e56SJunchao Zhang mode = "N"; 353152b3e56SJunchao Zhang } else { 354076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 355076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 356152b3e56SJunchao Zhang mode = "T"; 357152b3e56SJunchao Zhang } 358076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 359152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 360152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 361152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 362bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 363076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 3648c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3658c3ff71bSJunchao Zhang } 3668c3ff71bSJunchao Zhang 3678c3ff71bSJunchao Zhang /* z = A^H x + y */ 3688c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3698c3ff71bSJunchao Zhang { 3708c3ff71bSJunchao Zhang PetscErrorCode ierr; 3718c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 372152b3e56SJunchao Zhang const char *mode; 373152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 374152b3e56SJunchao Zhang PetscScalarKokkosView zv; 375076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3768c3ff71bSJunchao Zhang 3778c3ff71bSJunchao Zhang PetscFunctionBegin; 3788c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 379152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 380152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 381152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 3828c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 383152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 384076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 385152b3e56SJunchao Zhang mode = "N"; 386152b3e56SJunchao Zhang } else { 387076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 388076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 389152b3e56SJunchao Zhang mode = "C"; 390152b3e56SJunchao Zhang } 391076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 392152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 393152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 394152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 395bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 396076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 397152b3e56SJunchao Zhang PetscFunctionReturn(0); 398152b3e56SJunchao Zhang } 399152b3e56SJunchao Zhang 400152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 401152b3e56SJunchao Zhang { 402152b3e56SJunchao Zhang PetscErrorCode ierr; 403152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 404152b3e56SJunchao Zhang 405152b3e56SJunchao Zhang PetscFunctionBegin; 406152b3e56SJunchao Zhang switch (op) { 407152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 408152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 409152b3e56SJunchao Zhang if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 410152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 411152b3e56SJunchao Zhang break; 412152b3e56SJunchao Zhang default: 413152b3e56SJunchao Zhang ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 414152b3e56SJunchao Zhang break; 415152b3e56SJunchao Zhang } 4168c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4178c3ff71bSJunchao Zhang } 4188c3ff71bSJunchao Zhang 419076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */ 4203d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 4218c3ff71bSJunchao Zhang { 4228c3ff71bSJunchao Zhang PetscErrorCode ierr; 423076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 4248c3ff71bSJunchao Zhang 4258c3ff71bSJunchao Zhang PetscFunctionBegin; 426a3f881fbSStefano Zampini ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 427076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 428076ba34aSJunchao Zhang ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 4298c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 430076ba34aSJunchao Zhang ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 431076ba34aSJunchao Zhang } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 432076ba34aSJunchao Zhang if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 433076ba34aSJunchao Zhang ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 434076ba34aSJunchao Zhang ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 435076ba34aSJunchao Zhang ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 436076ba34aSJunchao Zhang ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 437076ba34aSJunchao Zhang aseq = static_cast<Mat_SeqAIJ*>(A->data); 438076ba34aSJunchao Zhang if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */ 439076ba34aSJunchao Zhang if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 440076ba34aSJunchao Zhang A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 4418c3ff71bSJunchao Zhang } 442076ba34aSJunchao Zhang } 4438c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4448c3ff71bSJunchao Zhang } 4458c3ff71bSJunchao Zhang 446076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 447076ba34aSJunchao Zhang an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 448076ba34aSJunchao Zhang */ 449076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 4508c3ff71bSJunchao Zhang { 4518c3ff71bSJunchao Zhang PetscErrorCode ierr; 452076ba34aSJunchao Zhang Mat_SeqAIJ *bseq; 453076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 454076ba34aSJunchao Zhang Mat mat; 4558c3ff71bSJunchao Zhang 4568c3ff71bSJunchao Zhang PetscFunctionBegin; 457076ba34aSJunchao Zhang /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 458076ba34aSJunchao Zhang ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 459076ba34aSJunchao Zhang mat = *B; 460076ba34aSJunchao Zhang if (A->assembled) { 461076ba34aSJunchao Zhang bseq = static_cast<Mat_SeqAIJ*>(mat->data); 462076ba34aSJunchao Zhang bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 463076ba34aSJunchao Zhang bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 464076ba34aSJunchao Zhang /* Now copy values to B if needed */ 465076ba34aSJunchao Zhang if (dupOption == MAT_COPY_VALUES) { 466076ba34aSJunchao Zhang if (akok->a_dual.need_sync_device()) { 467076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 468076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 469076ba34aSJunchao Zhang } else { /* If device has the latest data, we only copy data on device */ 470076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 471076ba34aSJunchao Zhang bkok->a_dual.modify_device(); 472076ba34aSJunchao Zhang } 473076ba34aSJunchao Zhang } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 474076ba34aSJunchao Zhang /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 475076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 476076ba34aSJunchao Zhang } 477076ba34aSJunchao Zhang mat->spptr = bkok; 478076ba34aSJunchao Zhang } 479076ba34aSJunchao Zhang 480076ba34aSJunchao Zhang ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 481076ba34aSJunchao Zhang ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 482076ba34aSJunchao Zhang ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 483076ba34aSJunchao Zhang ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 4848c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4858c3ff71bSJunchao Zhang } 4868c3ff71bSJunchao Zhang 487*0ecb592aSJunchao Zhang static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat *B) 488*0ecb592aSJunchao Zhang { 489*0ecb592aSJunchao Zhang PetscErrorCode ierr; 490*0ecb592aSJunchao Zhang Mat At; 491*0ecb592aSJunchao Zhang KokkosCsrMatrix *internT,*csrmatT; 492*0ecb592aSJunchao Zhang Mat_SeqAIJKokkos *atkok,*bkok; 493*0ecb592aSJunchao Zhang 494*0ecb592aSJunchao Zhang PetscFunctionBegin; 495*0ecb592aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&internT);CHKERRQ(ierr); /* Generate a transpose internally */ 496*0ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) { 497*0ecb592aSJunchao Zhang CHKERRCXX(csrmatT = new KokkosCsrMatrix("csrmat",*internT)); /* Deep copy internT to csrmatT, as we want to isolate the internal transpose */ 498*0ecb592aSJunchao Zhang CHKERRCXX(atkok = new Mat_SeqAIJKokkos(*csrmatT)); 499*0ecb592aSJunchao Zhang ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A),atkok,&At);CHKERRQ(ierr); 500*0ecb592aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) *B = At; 501*0ecb592aSJunchao Zhang else {ierr = MatHeaderMerge(A,&At);CHKERRQ(ierr);} /* Replace A with At inplace */ 502*0ecb592aSJunchao Zhang } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */ 503*0ecb592aSJunchao Zhang if ((*B)->assembled) { 504*0ecb592aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos*>((*B)->spptr); 505*0ecb592aSJunchao Zhang CHKERRCXX(Kokkos::deep_copy(bkok->a_dual.view_device(),internT->values)); 506*0ecb592aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(*B);CHKERRQ(ierr); 507*0ecb592aSJunchao Zhang } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */ 508*0ecb592aSJunchao Zhang Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ*>((*B)->data); 509*0ecb592aSJunchao Zhang MatScalarKokkosViewHost a_h(bseq->a,internT->nnz()); /* bseq->nz = 0 if unassembled */ 510*0ecb592aSJunchao Zhang MatColIdxKokkosViewHost j_h(bseq->j,internT->nnz()); 511*0ecb592aSJunchao Zhang CHKERRCXX(Kokkos::deep_copy(a_h,internT->values)); 512*0ecb592aSJunchao Zhang CHKERRCXX(Kokkos::deep_copy(j_h,internT->graph.entries)); 513*0ecb592aSJunchao Zhang } else SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_ARG_WRONGSTATE,"B must be assembled or preallocated"); 514*0ecb592aSJunchao Zhang } 515*0ecb592aSJunchao Zhang PetscFunctionReturn(0); 516*0ecb592aSJunchao Zhang } 517*0ecb592aSJunchao Zhang 5188c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 5198c3ff71bSJunchao Zhang { 5208c3ff71bSJunchao Zhang PetscErrorCode ierr; 52186a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 5228c3ff71bSJunchao Zhang 5238c3ff71bSJunchao Zhang PetscFunctionBegin; 52486a27549SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 52586a27549SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 5268f7e8f9dSMark Adams if (aijkok) { 5278f7e8f9dSMark Adams if (aijkok->device_mat_d.data()) { 528a587d139SMark delete aijkok->colmap_d; 529a587d139SMark delete aijkok->i_uncompressed_d; 530a587d139SMark } 5318f7e8f9dSMark Adams if (aijkok->diag_d) delete aijkok->diag_d; 5328f7e8f9dSMark Adams } 5338c3ff71bSJunchao Zhang delete aijkok; 53486a27549SJunchao Zhang } else { 53586a27549SJunchao Zhang delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 53686a27549SJunchao Zhang } 537152b3e56SJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 538152b3e56SJunchao Zhang A->spptr = NULL; 5398c3ff71bSJunchao Zhang ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 5408c3ff71bSJunchao Zhang PetscFunctionReturn(0); 5418c3ff71bSJunchao Zhang } 5428c3ff71bSJunchao Zhang 54386a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 54486a27549SJunchao Zhang { 54586a27549SJunchao Zhang PetscErrorCode ierr; 54686a27549SJunchao Zhang 54786a27549SJunchao Zhang PetscFunctionBegin; 54886a27549SJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 54986a27549SJunchao Zhang ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 55086a27549SJunchao Zhang ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 55186a27549SJunchao Zhang PetscFunctionReturn(0); 55286a27549SJunchao Zhang } 55386a27549SJunchao Zhang 554076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 555076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 556a3f881fbSStefano Zampini { 557076ba34aSJunchao Zhang PetscErrorCode ierr; 558076ba34aSJunchao Zhang Mat_SeqAIJ *a,*b; 559076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok,*bkok,*ckok; 560076ba34aSJunchao Zhang MatScalarKokkosView aa,ba,ca; 561076ba34aSJunchao Zhang MatRowMapKokkosView ai,bi,ci; 562076ba34aSJunchao Zhang MatColIdxKokkosView aj,bj,cj; 563076ba34aSJunchao Zhang PetscInt m,n,nnz,aN; 564a3f881fbSStefano Zampini 565a3f881fbSStefano Zampini PetscFunctionBegin; 566076ba34aSJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 567076ba34aSJunchao Zhang PetscValidHeaderSpecific(B,MAT_CLASSID,2); 568076ba34aSJunchao Zhang PetscValidPointer(C,4); 569076ba34aSJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 570076ba34aSJunchao Zhang PetscCheckTypeName(B,MATSEQAIJKOKKOS); 571076ba34aSJunchao Zhang if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); 572076ba34aSJunchao Zhang if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 573076ba34aSJunchao Zhang 574076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 575076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 576076ba34aSJunchao Zhang a = static_cast<Mat_SeqAIJ*>(A->data); 577076ba34aSJunchao Zhang b = static_cast<Mat_SeqAIJ*>(B->data); 578076ba34aSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 579076ba34aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 580076ba34aSJunchao Zhang aa = akok->a_dual.view_device(); 581076ba34aSJunchao Zhang ai = akok->i_dual.view_device(); 582076ba34aSJunchao Zhang ba = bkok->a_dual.view_device(); 583076ba34aSJunchao Zhang bi = bkok->i_dual.view_device(); 584076ba34aSJunchao Zhang m = A->rmap->n; /* M, N and nnz of C */ 585076ba34aSJunchao Zhang n = A->cmap->n + B->cmap->n; 586076ba34aSJunchao Zhang nnz = a->nz + b->nz; 587076ba34aSJunchao Zhang aN = A->cmap->n; /* N of A */ 588076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { 589076ba34aSJunchao Zhang aj = akok->j_dual.view_device(); 590076ba34aSJunchao Zhang bj = bkok->j_dual.view_device(); 591076ba34aSJunchao Zhang auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 592076ba34aSJunchao Zhang auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 593076ba34aSJunchao Zhang auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 594076ba34aSJunchao Zhang ca = ca_dual.view_device(); 595076ba34aSJunchao Zhang ci = ci_dual.view_device(); 596076ba34aSJunchao Zhang cj = cj_dual.view_device(); 597076ba34aSJunchao Zhang 598076ba34aSJunchao Zhang /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 599076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 600076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 601076ba34aSJunchao Zhang PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 602076ba34aSJunchao Zhang 603076ba34aSJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 604076ba34aSJunchao Zhang ci(i) = coffset; 605076ba34aSJunchao Zhang if (i == m-1) ci(m) = ai(m) + bi(m); 606076ba34aSJunchao Zhang }); 607076ba34aSJunchao Zhang 608076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 609076ba34aSJunchao Zhang if (k < alen) { 610076ba34aSJunchao Zhang ca(coffset+k) = aa(ai(i)+k); 611076ba34aSJunchao Zhang cj(coffset+k) = aj(ai(i)+k); 612076ba34aSJunchao Zhang } else { 613076ba34aSJunchao Zhang ca(coffset+k) = ba(bi(i)+k-alen); 614076ba34aSJunchao Zhang cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 615076ba34aSJunchao Zhang } 616076ba34aSJunchao Zhang }); 617076ba34aSJunchao Zhang }); 618076ba34aSJunchao Zhang ca_dual.modify_device(); 619076ba34aSJunchao Zhang ci_dual.modify_device(); 620076ba34aSJunchao Zhang cj_dual.modify_device(); 621076ba34aSJunchao Zhang CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 622076ba34aSJunchao Zhang ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 623076ba34aSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { 624076ba34aSJunchao Zhang PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 625076ba34aSJunchao Zhang PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 626076ba34aSJunchao Zhang ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 627076ba34aSJunchao Zhang ca = ckok->a_dual.view_device(); 628076ba34aSJunchao Zhang ci = ckok->i_dual.view_device(); 629076ba34aSJunchao Zhang 630076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 631076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 632076ba34aSJunchao Zhang PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 633076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 634076ba34aSJunchao Zhang if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 635076ba34aSJunchao Zhang else ca(ci(i)+k) = ba(bi(i)+k-alen); 636076ba34aSJunchao Zhang }); 637076ba34aSJunchao Zhang }); 638076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 639076ba34aSJunchao Zhang } 640076ba34aSJunchao Zhang PetscFunctionReturn(0); 641076ba34aSJunchao Zhang } 642076ba34aSJunchao Zhang 643076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 644076ba34aSJunchao Zhang { 645076ba34aSJunchao Zhang PetscFunctionBegin; 646076ba34aSJunchao Zhang delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 647a3f881fbSStefano Zampini PetscFunctionReturn(0); 648a3f881fbSStefano Zampini } 649a3f881fbSStefano Zampini 650a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 651a3f881fbSStefano Zampini { 652076ba34aSJunchao Zhang PetscErrorCode ierr; 653a3f881fbSStefano Zampini Mat_Product *product = C->product; 654a3f881fbSStefano Zampini Mat A,B; 655076ba34aSJunchao Zhang bool transA,transB; /* use bool, since KK needs this type */ 656a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok,*bkok,*ckok; 657a3f881fbSStefano Zampini Mat_SeqAIJ *c; 658076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 659076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA,*csrmatB; 660a3f881fbSStefano Zampini 661a3f881fbSStefano Zampini PetscFunctionBegin; 662a3f881fbSStefano Zampini MatCheckProduct(C,1); 663a3f881fbSStefano Zampini if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 664076ba34aSJunchao Zhang pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 665076ba34aSJunchao Zhang 666076ba34aSJunchao Zhang if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 667076ba34aSJunchao Zhang pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 668076ba34aSJunchao Zhang PetscFunctionReturn(0); 669076ba34aSJunchao Zhang } 670076ba34aSJunchao Zhang 671076ba34aSJunchao Zhang switch (product->type) { 672076ba34aSJunchao Zhang case MATPRODUCT_AB: transA = false; transB = false; break; 673076ba34aSJunchao Zhang case MATPRODUCT_AtB: transA = true; transB = false; break; 674076ba34aSJunchao Zhang case MATPRODUCT_ABt: transA = false; transB = true; break; 675076ba34aSJunchao Zhang default: 676076ba34aSJunchao Zhang SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 677076ba34aSJunchao Zhang } 678076ba34aSJunchao Zhang 679a3f881fbSStefano Zampini A = product->A; 680a3f881fbSStefano Zampini B = product->B; 681a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 682a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 683a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 684a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 685a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 686076ba34aSJunchao Zhang 687076ba34aSJunchao Zhang if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 688076ba34aSJunchao Zhang 689076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 690076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 691076ba34aSJunchao Zhang 692076ba34aSJunchao Zhang /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 693076ba34aSJunchao Zhang if (transA) { 694076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 695076ba34aSJunchao Zhang transA = false; 696a3f881fbSStefano Zampini } 697a3f881fbSStefano Zampini 698076ba34aSJunchao Zhang if (transB) { 699076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 700076ba34aSJunchao Zhang transB = false; 701076ba34aSJunchao Zhang } 702076ba34aSJunchao Zhang 703076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 704076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 705076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 706a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 707a3f881fbSStefano Zampini c = (Mat_SeqAIJ*)C->data; 708a3f881fbSStefano Zampini ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 709a3f881fbSStefano Zampini ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 710a3f881fbSStefano Zampini ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 711a3f881fbSStefano Zampini c->reallocs = 0; 712076ba34aSJunchao Zhang C->info.mallocs = 0; 713a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 714a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 715a3f881fbSStefano Zampini C->num_ass++; 716a3f881fbSStefano Zampini PetscFunctionReturn(0); 717a3f881fbSStefano Zampini } 718a3f881fbSStefano Zampini 719a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 720a3f881fbSStefano Zampini { 721a3f881fbSStefano Zampini PetscErrorCode ierr; 722076ba34aSJunchao Zhang Mat_Product *product = C->product; 723076ba34aSJunchao Zhang MatProductType ptype; 724076ba34aSJunchao Zhang Mat A,B; 725076ba34aSJunchao Zhang bool transA,transB; 726076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok,*bkok,*ckok; 727076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 728076ba34aSJunchao Zhang MPI_Comm comm; 729076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 730a3f881fbSStefano Zampini 731a3f881fbSStefano Zampini PetscFunctionBegin; 732a3f881fbSStefano Zampini MatCheckProduct(C,1); 733076ba34aSJunchao Zhang ierr = PetscObjectGetComm((PetscObject)C,&comm); 734076ba34aSJunchao Zhang if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty"); 735a3f881fbSStefano Zampini A = product->A; 736a3f881fbSStefano Zampini B = product->B; 737a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 738a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 739a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 740a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 741076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 742076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 743076ba34aSJunchao Zhang 744a3f881fbSStefano Zampini ptype = product->type; 745a3f881fbSStefano Zampini switch (ptype) { 746076ba34aSJunchao Zhang case MATPRODUCT_AB: transA = false; transB = false; break; 747076ba34aSJunchao Zhang case MATPRODUCT_AtB: transA = true; transB = false; break; 748076ba34aSJunchao Zhang case MATPRODUCT_ABt: transA = false; transB = true; break; 749a3f881fbSStefano Zampini default: 750076ba34aSJunchao Zhang SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 751a3f881fbSStefano Zampini } 752a3f881fbSStefano Zampini 753076ba34aSJunchao Zhang product->data = pdata = new MatProductData_SeqAIJKokkos(); 754076ba34aSJunchao Zhang pdata->kh.set_team_work_size(16); 755076ba34aSJunchao Zhang pdata->kh.set_dynamic_scheduling(true); 756076ba34aSJunchao Zhang pdata->reusesym = product->api_user; 757a3f881fbSStefano Zampini 758076ba34aSJunchao Zhang /* TODO: add command line options to select spgemm algorithms */ 759076ba34aSJunchao Zhang auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 760076ba34aSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 761076ba34aSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 762076ba34aSJunchao Zhang /* This algorithm + cuda-10.2 sometimes gave wrong results (invalid device pointers in csrmatC) and failed snes/tutorials/ex56.c */ 763076ba34aSJunchao Zhang spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_CUSPARSE; 764076ba34aSJunchao Zhang #endif 765076ba34aSJunchao Zhang #endif 766076ba34aSJunchao Zhang pdata->kh.create_spgemm_handle(spgemm_alg); 767076ba34aSJunchao Zhang 768076ba34aSJunchao Zhang /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 769076ba34aSJunchao Zhang if (transA) { 770076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 771076ba34aSJunchao Zhang transA = false; 772076ba34aSJunchao Zhang } 773076ba34aSJunchao Zhang 774076ba34aSJunchao Zhang if (transB) { 775076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 776076ba34aSJunchao Zhang transB = false; 777076ba34aSJunchao Zhang } 778076ba34aSJunchao Zhang 779076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 780076ba34aSJunchao Zhang /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 781076ba34aSJunchao Zhang So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 782076ba34aSJunchao Zhang calling new Mat_SeqAIJKokkos(). 783076ba34aSJunchao Zhang TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 784076ba34aSJunchao Zhang */ 785076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 786076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 787076ba34aSJunchao Zhang 788076ba34aSJunchao Zhang CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 789076ba34aSJunchao Zhang ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 790076ba34aSJunchao Zhang C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 791a3f881fbSStefano Zampini PetscFunctionReturn(0); 792a3f881fbSStefano Zampini } 793a3f881fbSStefano Zampini 794a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 795076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 796a3f881fbSStefano Zampini { 797a3f881fbSStefano Zampini PetscErrorCode ierr; 798076ba34aSJunchao Zhang Mat_Product *product = mat->product; 799a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 800a3f881fbSStefano Zampini 801a3f881fbSStefano Zampini PetscFunctionBegin; 802a3f881fbSStefano Zampini MatCheckProduct(mat,1); 803a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 804a3f881fbSStefano Zampini if (product->type == MATPRODUCT_ABC) { 805a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 806a3f881fbSStefano Zampini } 807a3f881fbSStefano Zampini if (Biskok && Ciskok) { 808a3f881fbSStefano Zampini switch (product->type) { 809a3f881fbSStefano Zampini case MATPRODUCT_AB: 810a3f881fbSStefano Zampini case MATPRODUCT_AtB: 811a3f881fbSStefano Zampini case MATPRODUCT_ABt: 812a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 813a3f881fbSStefano Zampini break; 814a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 815a3f881fbSStefano Zampini case MATPRODUCT_RARt: 816a3f881fbSStefano Zampini case MATPRODUCT_ABC: 817a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 818a3f881fbSStefano Zampini break; 819a3f881fbSStefano Zampini default: 820a3f881fbSStefano Zampini break; 821a3f881fbSStefano Zampini } 822a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 823a3f881fbSStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 824a3f881fbSStefano Zampini } 825a3f881fbSStefano Zampini PetscFunctionReturn(0); 826a3f881fbSStefano Zampini } 827a587d139SMark 828f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 829f0cf5187SStefano Zampini { 830f0cf5187SStefano Zampini PetscErrorCode ierr; 831f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 832f0cf5187SStefano Zampini 833f0cf5187SStefano Zampini PetscFunctionBegin; 834f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 835f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 836076ba34aSJunchao Zhang KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 837076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 838f0cf5187SStefano Zampini ierr = WaitForKokkos();CHKERRQ(ierr); 839076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 840f0cf5187SStefano Zampini PetscFunctionReturn(0); 841f0cf5187SStefano Zampini } 842f0cf5187SStefano Zampini 843a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 844a587d139SMark { 845a587d139SMark PetscErrorCode ierr; 846076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 847a587d139SMark 848a587d139SMark PetscFunctionBegin; 849076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 850076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 851076ba34aSJunchao Zhang KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 852076ba34aSJunchao Zhang ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); /* Cached diagonal values are invalided */ 853076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 854a587d139SMark PetscFunctionReturn(0); 855a587d139SMark } 856a587d139SMark 857db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 858db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 859db78de30SJunchao Zhang { 860db78de30SJunchao Zhang PetscErrorCode ierr; 861db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 862db78de30SJunchao Zhang 863db78de30SJunchao Zhang PetscFunctionBegin; 864db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 865db78de30SJunchao Zhang PetscValidPointer(kv,2); 866db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 867db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 868db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 869076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 870db78de30SJunchao Zhang PetscFunctionReturn(0); 871db78de30SJunchao Zhang } 872db78de30SJunchao Zhang 873db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 874db78de30SJunchao Zhang { 875db78de30SJunchao Zhang PetscFunctionBegin; 876db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 877db78de30SJunchao Zhang PetscValidPointer(kv,2); 878db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 879db78de30SJunchao Zhang PetscFunctionReturn(0); 880db78de30SJunchao Zhang } 881db78de30SJunchao Zhang 882db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv) 883db78de30SJunchao Zhang { 884db78de30SJunchao Zhang PetscErrorCode ierr; 885db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 886db78de30SJunchao Zhang 887db78de30SJunchao Zhang PetscFunctionBegin; 888db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 889db78de30SJunchao Zhang PetscValidPointer(kv,2); 890db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 891db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 892db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 893076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 894db78de30SJunchao Zhang PetscFunctionReturn(0); 895db78de30SJunchao Zhang } 896db78de30SJunchao Zhang 897db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv) 898db78de30SJunchao Zhang { 899db78de30SJunchao Zhang PetscErrorCode ierr; 900db78de30SJunchao Zhang 901db78de30SJunchao Zhang PetscFunctionBegin; 902db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 903db78de30SJunchao Zhang PetscValidPointer(kv,2); 904db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 905076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 906db78de30SJunchao Zhang PetscFunctionReturn(0); 907db78de30SJunchao Zhang } 908db78de30SJunchao Zhang 909db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 910db78de30SJunchao Zhang { 911db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 912db78de30SJunchao Zhang 913db78de30SJunchao Zhang PetscFunctionBegin; 914db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 915db78de30SJunchao Zhang PetscValidPointer(kv,2); 916db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 917db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 918076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 919db78de30SJunchao Zhang PetscFunctionReturn(0); 920db78de30SJunchao Zhang } 921db78de30SJunchao Zhang 922db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 923db78de30SJunchao Zhang { 924db78de30SJunchao Zhang PetscErrorCode ierr; 925db78de30SJunchao Zhang 926db78de30SJunchao Zhang PetscFunctionBegin; 927db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 928db78de30SJunchao Zhang PetscValidPointer(kv,2); 929db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 930076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 931db78de30SJunchao Zhang PetscFunctionReturn(0); 932db78de30SJunchao Zhang } 933db78de30SJunchao Zhang 934c17cf699SJunchao Zhang /* Computes Y += alpha X */ 935c17cf699SJunchao Zhang static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 936a587d139SMark { 937a587d139SMark PetscErrorCode ierr; 938a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 939c17cf699SJunchao Zhang Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 940c17cf699SJunchao Zhang ConstMatScalarKokkosView Xa; 941c17cf699SJunchao Zhang MatScalarKokkosView Ya; 942a587d139SMark 943a587d139SMark PetscFunctionBegin; 944c17cf699SJunchao Zhang PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 945c17cf699SJunchao Zhang PetscCheckTypeName(X,MATSEQAIJKOKKOS); 946c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 947c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 948db78de30SJunchao Zhang 949c17cf699SJunchao Zhang if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 950c17cf699SJunchao Zhang /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 951a587d139SMark PetscBool e; 952a587d139SMark ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 953a587d139SMark if (e) { 954a587d139SMark ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 955c17cf699SJunchao Zhang if (e) pattern = SAME_NONZERO_PATTERN; 956a587d139SMark } 957a587d139SMark } 958db78de30SJunchao Zhang 959c17cf699SJunchao Zhang /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 960c17cf699SJunchao Zhang cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 961c17cf699SJunchao Zhang If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 962c17cf699SJunchao Zhang KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 963c17cf699SJunchao Zhang */ 964c17cf699SJunchao Zhang ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 965c17cf699SJunchao Zhang xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 966c17cf699SJunchao Zhang Xa = xkok->a_dual.view_device(); 967c17cf699SJunchao Zhang Ya = ykok->a_dual.view_device(); 968c17cf699SJunchao Zhang 969c17cf699SJunchao Zhang if (pattern == SAME_NONZERO_PATTERN) { 970c17cf699SJunchao Zhang KokkosBlas::axpy(alpha,Xa,Ya); 971c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(Y); 972c17cf699SJunchao Zhang } else if (pattern == SUBSET_NONZERO_PATTERN) { 973c17cf699SJunchao Zhang MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 974c17cf699SJunchao Zhang MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 975c17cf699SJunchao Zhang 976c17cf699SJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 977c17cf699SJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 978c17cf699SJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 979c17cf699SJunchao Zhang PetscInt p,q = Yi(i); 980c17cf699SJunchao Zhang for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 981c17cf699SJunchao Zhang while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 982c17cf699SJunchao Zhang if (Xj(p) == Yj(q)) { /* Found it */ 983c17cf699SJunchao Zhang Ya(q) += alpha * Xa(p); 984c17cf699SJunchao Zhang q++; 985a587d139SMark } else { 986c17cf699SJunchao Zhang /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 987c17cf699SJunchao Zhang Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 988c17cf699SJunchao Zhang */ 989c17cf699SJunchao Zhang if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 990a587d139SMark } 991c17cf699SJunchao Zhang } 992c17cf699SJunchao Zhang }); 993c17cf699SJunchao Zhang }); 994c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(Y); 995c17cf699SJunchao Zhang } else { /* different nonzero patterns */ 996c17cf699SJunchao Zhang Mat Z; 997c17cf699SJunchao Zhang KokkosCsrMatrix zcsr; 998c17cf699SJunchao Zhang KernelHandle kh; 999c17cf699SJunchao Zhang kh.create_spadd_handle(false); 1000c17cf699SJunchao Zhang KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 1001c17cf699SJunchao Zhang KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 1002c17cf699SJunchao Zhang zkok = new Mat_SeqAIJKokkos(zcsr); 1003c17cf699SJunchao Zhang ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); 1004c17cf699SJunchao Zhang ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); 1005c17cf699SJunchao Zhang kh.destroy_spadd_handle(); 1006c17cf699SJunchao Zhang } 1007c17cf699SJunchao Zhang 1008a587d139SMark PetscFunctionReturn(0); 1009a587d139SMark } 1010a587d139SMark 101186a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 10128f7e8f9dSMark Adams { 10138f7e8f9dSMark Adams PetscErrorCode ierr; 10148f7e8f9dSMark Adams 10158f7e8f9dSMark Adams PetscFunctionBegin; 10168f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 10178f7e8f9dSMark Adams ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 10188f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 10198f7e8f9dSMark Adams PetscFunctionReturn(0); 10208f7e8f9dSMark Adams } 10218f7e8f9dSMark Adams 10228c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 10238c3ff71bSJunchao Zhang { 1024076ba34aSJunchao Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1025076ba34aSJunchao Zhang 10268c3ff71bSJunchao Zhang PetscFunctionBegin; 1027076ba34aSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 10286f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 10296f3d89d0SStefano Zampini 10308c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 10318c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 10328c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1033a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 1034f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 1035a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1036076ba34aSJunchao Zhang A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 10378c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 10388c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 10398c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 10408c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 10418c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 10428c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1043076ba34aSJunchao Zhang A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1044*0ecb592aSJunchao Zhang A->ops->transpose = MatTranspose_SeqAIJKokkos; 1045152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 1046076ba34aSJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1047076ba34aSJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1048076ba34aSJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1049076ba34aSJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1050076ba34aSJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1051076ba34aSJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1052076ba34aSJunchao Zhang PetscFunctionReturn(0); 1053076ba34aSJunchao Zhang } 1054076ba34aSJunchao Zhang 1055076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1056076ba34aSJunchao Zhang { 1057076ba34aSJunchao Zhang PetscErrorCode ierr; 1058076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 1059076ba34aSJunchao Zhang PetscInt i,m,n; 1060076ba34aSJunchao Zhang 1061076ba34aSJunchao Zhang PetscFunctionBegin; 1062076ba34aSJunchao Zhang if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1063076ba34aSJunchao Zhang 1064076ba34aSJunchao Zhang m = akok->nrows(); 1065076ba34aSJunchao Zhang n = akok->ncols(); 1066076ba34aSJunchao Zhang ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 1067076ba34aSJunchao Zhang ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1068076ba34aSJunchao Zhang 1069076ba34aSJunchao Zhang /* Set up data structures of A as a MATSEQAIJ */ 1070076ba34aSJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1071076ba34aSJunchao Zhang aseq = (Mat_SeqAIJ*)(A)->data; 1072076ba34aSJunchao Zhang 1073076ba34aSJunchao Zhang akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1074076ba34aSJunchao Zhang akok->j_dual.sync_host(); 1075076ba34aSJunchao Zhang 1076076ba34aSJunchao Zhang aseq->i = akok->i_host_data(); 1077076ba34aSJunchao Zhang aseq->j = akok->j_host_data(); 1078076ba34aSJunchao Zhang aseq->a = akok->a_host_data(); 1079076ba34aSJunchao Zhang aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1080076ba34aSJunchao Zhang aseq->singlemalloc = PETSC_FALSE; 1081076ba34aSJunchao Zhang aseq->free_a = PETSC_FALSE; 1082076ba34aSJunchao Zhang aseq->free_ij = PETSC_FALSE; 1083076ba34aSJunchao Zhang aseq->nz = akok->nnz(); 1084076ba34aSJunchao Zhang aseq->maxnz = aseq->nz; 1085076ba34aSJunchao Zhang 1086076ba34aSJunchao Zhang ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1087076ba34aSJunchao Zhang ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1088076ba34aSJunchao Zhang for (i=0; i<m; i++) { 1089076ba34aSJunchao Zhang aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1090076ba34aSJunchao Zhang } 1091076ba34aSJunchao Zhang 1092076ba34aSJunchao Zhang /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1093076ba34aSJunchao Zhang akok->nonzerostate = A->nonzerostate; 1094076ba34aSJunchao Zhang ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1095076ba34aSJunchao Zhang ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1096076ba34aSJunchao Zhang A->spptr = akok; 1097076ba34aSJunchao Zhang PetscFunctionReturn(0); 1098076ba34aSJunchao Zhang } 1099076ba34aSJunchao Zhang 1100076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1101076ba34aSJunchao Zhang 1102076ba34aSJunchao Zhang Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1103076ba34aSJunchao Zhang */ 1104076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1105076ba34aSJunchao Zhang { 1106076ba34aSJunchao Zhang PetscErrorCode ierr; 1107076ba34aSJunchao Zhang 1108076ba34aSJunchao Zhang PetscFunctionBegin; 1109076ba34aSJunchao Zhang ierr = MatCreate(comm,A);CHKERRQ(ierr); 1110076ba34aSJunchao Zhang ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 11118c3ff71bSJunchao Zhang PetscFunctionReturn(0); 11128c3ff71bSJunchao Zhang } 11138c3ff71bSJunchao Zhang 11148c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/ 1115152b3e56SJunchao Zhang /*@C 11168c3ff71bSJunchao Zhang MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 11178c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 11188c3ff71bSJunchao Zhang Kokkos for calculations. For good matrix 11198c3ff71bSJunchao Zhang assembly performance the user should preallocate the matrix storage by setting 11208c3ff71bSJunchao Zhang the parameter nz (or the array nnz). By setting these parameters accurately, 11218c3ff71bSJunchao Zhang performance during matrix assembly can be increased by more than a factor of 50. 11228c3ff71bSJunchao Zhang 11238c3ff71bSJunchao Zhang Collective 11248c3ff71bSJunchao Zhang 11258c3ff71bSJunchao Zhang Input Parameters: 11268c3ff71bSJunchao Zhang + comm - MPI communicator, set to PETSC_COMM_SELF 11278c3ff71bSJunchao Zhang . m - number of rows 11288c3ff71bSJunchao Zhang . n - number of columns 11298c3ff71bSJunchao Zhang . nz - number of nonzeros per row (same for all rows) 11308c3ff71bSJunchao Zhang - nnz - array containing the number of nonzeros in the various rows 11318c3ff71bSJunchao Zhang (possibly different for each row) or NULL 11328c3ff71bSJunchao Zhang 11338c3ff71bSJunchao Zhang Output Parameter: 11348c3ff71bSJunchao Zhang . A - the matrix 11358c3ff71bSJunchao Zhang 11368c3ff71bSJunchao Zhang It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 11378c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 11388c3ff71bSJunchao Zhang [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 11398c3ff71bSJunchao Zhang 11408c3ff71bSJunchao Zhang Notes: 11418c3ff71bSJunchao Zhang If nnz is given then nz is ignored 11428c3ff71bSJunchao Zhang 11438c3ff71bSJunchao Zhang The AIJ format (also called the Yale sparse matrix format or 11448c3ff71bSJunchao Zhang compressed row storage), is fully compatible with standard Fortran 77 11458c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 11468c3ff71bSJunchao Zhang either one (as in Fortran) or zero. See the users' manual for details. 11478c3ff71bSJunchao Zhang 11488c3ff71bSJunchao Zhang Specify the preallocated storage with either nz or nnz (not both). 11498c3ff71bSJunchao Zhang Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 11508c3ff71bSJunchao Zhang allocation. For large problems you MUST preallocate memory or you 11518c3ff71bSJunchao Zhang will get TERRIBLE performance, see the users' manual chapter on matrices. 11528c3ff71bSJunchao Zhang 11538c3ff71bSJunchao Zhang By default, this format uses inodes (identical nodes) when possible, to 11548c3ff71bSJunchao Zhang improve numerical efficiency of matrix-vector products and solves. We 11558c3ff71bSJunchao Zhang search for consecutive rows with the same nonzero structure, thereby 11568c3ff71bSJunchao Zhang reusing matrix information to achieve increased efficiency. 11578c3ff71bSJunchao Zhang 11588c3ff71bSJunchao Zhang Level: intermediate 11598c3ff71bSJunchao Zhang 1160076ba34aSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 11618c3ff71bSJunchao Zhang @*/ 11628c3ff71bSJunchao Zhang PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 11638c3ff71bSJunchao Zhang { 11648c3ff71bSJunchao Zhang PetscErrorCode ierr; 11658c3ff71bSJunchao Zhang 11668c3ff71bSJunchao Zhang PetscFunctionBegin; 11678c3ff71bSJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 11688c3ff71bSJunchao Zhang ierr = MatCreate(comm,A);CHKERRQ(ierr); 11698c3ff71bSJunchao Zhang ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 11708c3ff71bSJunchao Zhang ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 11718c3ff71bSJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 11728c3ff71bSJunchao Zhang PetscFunctionReturn(0); 11738c3ff71bSJunchao Zhang } 1174930e68a5SMark Adams 11758f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member; 11768f7e8f9dSMark Adams // 11778f7e8f9dSMark Adams // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container. 11788f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 11798f7e8f9dSMark Adams // 11808f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1181930e68a5SMark Adams { 11828f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 11838f7e8f9dSMark Adams Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 11848f7e8f9dSMark Adams IS isrow = b->row,isicol = b->icol; 1185930e68a5SMark Adams PetscErrorCode ierr; 11868f7e8f9dSMark Adams const PetscInt *r_h,*ic_h; 1187076ba34aSJunchao Zhang const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data(); 1188076ba34aSJunchao Zhang const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1189076ba34aSJunchao Zhang PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 11908f7e8f9dSMark Adams PetscBool row_identity,col_identity; 11918f7e8f9dSMark Adams PetscInt nc, Nf, nVec=32; // should be a parameter 11928f7e8f9dSMark Adams PetscContainer container; 1193930e68a5SMark Adams 1194930e68a5SMark Adams PetscFunctionBegin; 11958f7e8f9dSMark Adams if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n); 11968f7e8f9dSMark Adams ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 11978f7e8f9dSMark Adams if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 11988f7e8f9dSMark Adams ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); 11998f7e8f9dSMark Adams if (container) { 12008f7e8f9dSMark Adams PetscInt *pNf=NULL, nv; 12018f7e8f9dSMark Adams ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); 12028f7e8f9dSMark Adams Nf = (*pNf)%1000; 12038f7e8f9dSMark Adams nv = (*pNf)/1000; 12048f7e8f9dSMark Adams if (nv>0) nVec = nv; 12058f7e8f9dSMark Adams } else Nf = 1; 12068f7e8f9dSMark Adams if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf); 12078f7e8f9dSMark Adams ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 12088f7e8f9dSMark Adams ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 12098f7e8f9dSMark Adams ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 12108f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 12118f7e8f9dSMark Adams ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 12128f7e8f9dSMark Adams #endif 12138f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 12148f7e8f9dSMark Adams { 12158f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1 12168f7e8f9dSMark Adams using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 12178f7e8f9dSMark Adams using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 12188f7e8f9dSMark Adams using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 12198f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 12208f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 12218f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 12228f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 12238f7e8f9dSMark Adams size_t flops_h = 0.0; 12248f7e8f9dSMark Adams Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 12258f7e8f9dSMark Adams Kokkos::View<size_t> d_flops_k ("flops"); 12268f7e8f9dSMark Adams const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 12278f7e8f9dSMark Adams const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 12288f7e8f9dSMark Adams Kokkos::deep_copy (d_flops_k, h_flops_k); 12298f7e8f9dSMark Adams Kokkos::deep_copy (d_r_k, h_r_k); 12308f7e8f9dSMark Adams Kokkos::deep_copy (d_ic_k, h_ic_k); 12318f7e8f9dSMark Adams // Fill A --> fact 12328f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1233042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 12348f7e8f9dSMark Adams const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 12358f7e8f9dSMark Adams const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 12368f7e8f9dSMark Adams // zero rows of B 12378f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 12388f7e8f9dSMark Adams PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 12398f7e8f9dSMark Adams PetscScalar *baL = ba_d + bi_d[rowb]; 12408f7e8f9dSMark Adams PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 12418f7e8f9dSMark Adams /* zero (unfactored row) */ 12428f7e8f9dSMark Adams for (int j=0;j<nzbL;j++) baL[j] = 0; 12438f7e8f9dSMark Adams for (int j=0;j<nzbU;j++) baU[j] = 0; 12448f7e8f9dSMark Adams }); 12458f7e8f9dSMark Adams // copy A into B 12468f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 12478f7e8f9dSMark Adams PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 12488f7e8f9dSMark Adams const PetscScalar *av = aa_d + ai_d[rowa]; 12498f7e8f9dSMark Adams const PetscInt *ajtmp = aj_d + ai_d[rowa]; 12508f7e8f9dSMark Adams /* load in initial (unfactored row) */ 12518f7e8f9dSMark Adams for (int j=0;j<nza;j++) { 12528f7e8f9dSMark Adams PetscInt colb = ic[ajtmp[j]]; 12538f7e8f9dSMark Adams PetscScalar vala = av[j]; 12548f7e8f9dSMark Adams if (colb == rowb) { 12558f7e8f9dSMark Adams *(ba_d + bdiag_d[rowb]) = vala; 12568f7e8f9dSMark Adams } else { 12578f7e8f9dSMark Adams const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 12588f7e8f9dSMark Adams PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 12598f7e8f9dSMark Adams PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 12608f7e8f9dSMark Adams for (int j=0; j<nz ; j++) { 12618f7e8f9dSMark Adams if (pbj[j] == colb) { 12628f7e8f9dSMark Adams pba[j] = vala; 12638f7e8f9dSMark Adams set++; 12648f7e8f9dSMark Adams break; 12658f7e8f9dSMark Adams } 12668f7e8f9dSMark Adams } 12678f7e8f9dSMark Adams if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 12688f7e8f9dSMark Adams } 12698f7e8f9dSMark Adams } 12708f7e8f9dSMark Adams }); 12718f7e8f9dSMark Adams }); 12728f7e8f9dSMark Adams Kokkos::fence(); 1273930e68a5SMark Adams 12748f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 12758f7e8f9dSMark Adams sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 12768f7e8f9dSMark Adams scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 12778f7e8f9dSMark Adams sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1278042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 12798f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc; 12808f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 12818f7e8f9dSMark Adams // A22 panel update for each row A(1,:) and col A(:,1) 12828f7e8f9dSMark Adams for (int ii=start; ii<end-1; ii++) { 12838f7e8f9dSMark Adams const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 12848f7e8f9dSMark Adams const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 12858f7e8f9dSMark Adams const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 12868f7e8f9dSMark Adams const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 12878f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 12888f7e8f9dSMark Adams PetscInt kIdx = j*Ni + field_block_idx; 12898f7e8f9dSMark Adams if (kIdx >= nzUi) /* void */ ; 12908f7e8f9dSMark Adams else { 12918f7e8f9dSMark Adams const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 12928f7e8f9dSMark Adams const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 12938f7e8f9dSMark Adams const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 12948f7e8f9dSMark Adams size_t st_idx; 12958f7e8f9dSMark Adams // find and do L(k,i) = A(:k,i) / A(i,i) 12968f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 12978f7e8f9dSMark Adams // get column, there has got to be a better way 12988f7e8f9dSMark Adams Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 12998f7e8f9dSMark Adams //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii); 13008f7e8f9dSMark Adams if (pjL[j] == ii) { 13018f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + j; 13028f7e8f9dSMark Adams idx = j; // output 13038f7e8f9dSMark Adams *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 13048f7e8f9dSMark Adams } 13058f7e8f9dSMark Adams }, st_idx); 13068f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 13078f7e8f9dSMark Adams if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii); 13088f7e8f9dSMark Adams else { // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 13098f7e8f9dSMark Adams // U(i+1,:end) 13108f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 13118f7e8f9dSMark Adams PetscScalar Uij = baUi[uiIdx]; 13128f7e8f9dSMark Adams PetscInt col = bjUi[uiIdx]; 13138f7e8f9dSMark Adams if (col==myk) { 13148f7e8f9dSMark Adams // A_kk = A_kk - L_ki * U_ij(k) 13158f7e8f9dSMark Adams PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 13168f7e8f9dSMark Adams *Akkv = *Akkv - L_ki() * Uij; // UiK 13178f7e8f9dSMark Adams } else { 13188f7e8f9dSMark Adams PetscScalar *start, *end, *pAkjv=NULL; 13198f7e8f9dSMark Adams PetscInt high, low; 13208f7e8f9dSMark Adams const PetscInt *startj; 13218f7e8f9dSMark Adams if (col<myk) { // L 13228f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 13238f7e8f9dSMark Adams PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 13248f7e8f9dSMark Adams start = pLki+1; // start at pLki+1, A22(myk,1) 13258f7e8f9dSMark Adams startj= bj_d + bi_d[myk] + idx; 13268f7e8f9dSMark Adams end = ba_d + bi_d[myk+1]; 13278f7e8f9dSMark Adams } else { 13288f7e8f9dSMark Adams PetscInt idx = bdiag_d[myk+1]+1; 13298f7e8f9dSMark Adams start = ba_d + idx; 13308f7e8f9dSMark Adams startj= bj_d + idx; 13318f7e8f9dSMark Adams end = ba_d + bdiag_d[myk]; 13328f7e8f9dSMark Adams } 13338f7e8f9dSMark Adams // search for 'col', use bisection search - TODO 13348f7e8f9dSMark Adams low = 0; 13358f7e8f9dSMark Adams high = (PetscInt)(end-start); 13368f7e8f9dSMark Adams while (high-low > 5) { 13378f7e8f9dSMark Adams int t = (low+high)/2; 13388f7e8f9dSMark Adams if (startj[t] > col) high = t; 13398f7e8f9dSMark Adams else low = t; 13408f7e8f9dSMark Adams } 13418f7e8f9dSMark Adams for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 13428f7e8f9dSMark Adams if (startj[pAkjv-start] == col) break; 13438f7e8f9dSMark Adams } 13448f7e8f9dSMark Adams if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col); 13458f7e8f9dSMark Adams *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 13468f7e8f9dSMark Adams } 13478f7e8f9dSMark Adams }); 13488f7e8f9dSMark Adams } 13498f7e8f9dSMark Adams } 13508f7e8f9dSMark Adams }); 13518f7e8f9dSMark Adams team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 13528f7e8f9dSMark Adams if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 13538f7e8f9dSMark Adams } /* endof for (i=0; i<n; i++) { */ 13548f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 13558f7e8f9dSMark Adams }); 13568f7e8f9dSMark Adams Kokkos::fence(); 13578f7e8f9dSMark Adams Kokkos::deep_copy (h_flops_k, d_flops_k); 13588f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 13598f7e8f9dSMark Adams ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 13608f7e8f9dSMark Adams #elif defined(PETSC_USE_LOG) 13618f7e8f9dSMark Adams ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 13628f7e8f9dSMark Adams #endif 13638f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 13648f7e8f9dSMark Adams const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 13658f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 13668f7e8f9dSMark Adams /* Invert diagonal for simpler triangular solves */ 13678f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 13688f7e8f9dSMark Adams int i = start + outer_index*Ni + lg_rank%Ni; 13698f7e8f9dSMark Adams if (i < end) { 13708f7e8f9dSMark Adams PetscScalar *pv = ba_d + bdiag_d[i]; 13718f7e8f9dSMark Adams *pv = 1.0/(*pv); 13728f7e8f9dSMark Adams } 13738f7e8f9dSMark Adams }); 13748f7e8f9dSMark Adams }); 13758f7e8f9dSMark Adams } 13768f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 13778f7e8f9dSMark Adams ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 13788f7e8f9dSMark Adams #endif 13798f7e8f9dSMark Adams ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 13808f7e8f9dSMark Adams ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 13818f7e8f9dSMark Adams 13828f7e8f9dSMark Adams ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 13838f7e8f9dSMark Adams ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 13848f7e8f9dSMark Adams if (b->inode.size) { 13858f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_Inode; 13868f7e8f9dSMark Adams } else if (row_identity && col_identity) { 13878f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 13888f7e8f9dSMark Adams } else { 13898f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 13908f7e8f9dSMark Adams } 13918f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_GPU; 13928f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 13938f7e8f9dSMark Adams B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 13948f7e8f9dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 13958f7e8f9dSMark Adams B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 13968f7e8f9dSMark Adams B->ops->matsolve = MatMatSolve_SeqAIJ; 13978f7e8f9dSMark Adams B->assembled = PETSC_TRUE; 13988f7e8f9dSMark Adams B->preallocated = PETSC_TRUE; 13998f7e8f9dSMark Adams 1400930e68a5SMark Adams PetscFunctionReturn(0); 1401930e68a5SMark Adams } 1402930e68a5SMark Adams 140386a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1404930e68a5SMark Adams { 1405930e68a5SMark Adams PetscErrorCode ierr; 1406930e68a5SMark Adams 1407930e68a5SMark Adams PetscFunctionBegin; 1408930e68a5SMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 140986a27549SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 141086a27549SJunchao Zhang PetscFunctionReturn(0); 141186a27549SJunchao Zhang } 141286a27549SJunchao Zhang 141386a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 141486a27549SJunchao Zhang { 141586a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 141686a27549SJunchao Zhang 141786a27549SJunchao Zhang PetscFunctionBegin; 141886a27549SJunchao Zhang if (!factors->sptrsv_symbolic_completed) { 141986a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 142086a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 142186a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_TRUE; 142286a27549SJunchao Zhang } 142386a27549SJunchao Zhang PetscFunctionReturn(0); 142486a27549SJunchao Zhang } 142586a27549SJunchao Zhang 142686a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */ 142786a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 142886a27549SJunchao Zhang { 142986a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1430076ba34aSJunchao Zhang MatColIdxType n = A->rmap->n; 143186a27549SJunchao Zhang 143286a27549SJunchao Zhang PetscFunctionBegin; 143386a27549SJunchao Zhang if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 143486a27549SJunchao Zhang /* Update L^T and do sptrsv symbolic */ 1435076ba34aSJunchao Zhang factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 143686a27549SJunchao Zhang Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1437076ba34aSJunchao Zhang factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1438076ba34aSJunchao Zhang factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 143986a27549SJunchao Zhang 144086a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1441076ba34aSJunchao Zhang ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1442076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1443076ba34aSJunchao Zhang MatRowMapKokkosView,DefaultExecutionSpace>( 144486a27549SJunchao Zhang n,n,factors->iL_d,factors->jL_d,factors->aL_d, 144586a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 144686a27549SJunchao Zhang 144786a27549SJunchao Zhang /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 144886a27549SJunchao Zhang We have to sort the indices, until KK provides finer control options. 144986a27549SJunchao Zhang */ 1450076ba34aSJunchao Zhang KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1451076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 145286a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 145386a27549SJunchao Zhang 145486a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 145586a27549SJunchao Zhang 145686a27549SJunchao Zhang /* Update U^T and do sptrsv symbolic */ 1457076ba34aSJunchao Zhang factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 145886a27549SJunchao Zhang Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1459076ba34aSJunchao Zhang factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1460076ba34aSJunchao Zhang factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 146186a27549SJunchao Zhang 146286a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1463076ba34aSJunchao Zhang ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1464076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1465076ba34aSJunchao Zhang MatRowMapKokkosView,DefaultExecutionSpace>( 146686a27549SJunchao Zhang n,n,factors->iU_d, factors->jU_d, factors->aU_d, 146786a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 146886a27549SJunchao Zhang 146986a27549SJunchao Zhang /* Sort indices. See comments above */ 1470076ba34aSJunchao Zhang KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1471076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 147286a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 147386a27549SJunchao Zhang 147486a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 147586a27549SJunchao Zhang factors->transpose_updated = PETSC_TRUE; 147686a27549SJunchao Zhang } 147786a27549SJunchao Zhang PetscFunctionReturn(0); 147886a27549SJunchao Zhang } 147986a27549SJunchao Zhang 148086a27549SJunchao Zhang /* Solve Ax = b, with A = LU */ 148186a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 148286a27549SJunchao Zhang { 148386a27549SJunchao Zhang PetscErrorCode ierr; 148486a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 148586a27549SJunchao Zhang PetscScalarKokkosView xv; 148686a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 148786a27549SJunchao Zhang 148886a27549SJunchao Zhang PetscFunctionBegin; 148986a27549SJunchao Zhang ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 149086a27549SJunchao Zhang ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 149186a27549SJunchao Zhang ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 149286a27549SJunchao Zhang /* Solve L tmpv = b */ 14933f520e80SJunchao Zhang CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 149486a27549SJunchao Zhang /* Solve Ux = tmpv */ 14953f520e80SJunchao Zhang CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 149686a27549SJunchao Zhang ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 149786a27549SJunchao Zhang ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 149886a27549SJunchao Zhang PetscFunctionReturn(0); 149986a27549SJunchao Zhang } 150086a27549SJunchao Zhang 1501076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */ 150286a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 150386a27549SJunchao Zhang { 150486a27549SJunchao Zhang PetscErrorCode ierr; 150586a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 150686a27549SJunchao Zhang PetscScalarKokkosView xv; 150786a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 150886a27549SJunchao Zhang 150986a27549SJunchao Zhang PetscFunctionBegin; 151086a27549SJunchao Zhang ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 151186a27549SJunchao Zhang ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 151286a27549SJunchao Zhang ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 151386a27549SJunchao Zhang /* Solve U^T tmpv = b */ 151486a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 151586a27549SJunchao Zhang 151686a27549SJunchao Zhang /* Solve L^T x = tmpv */ 151786a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 151886a27549SJunchao Zhang ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 151986a27549SJunchao Zhang ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 152086a27549SJunchao Zhang PetscFunctionReturn(0); 152186a27549SJunchao Zhang } 152286a27549SJunchao Zhang 152386a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 152486a27549SJunchao Zhang { 152586a27549SJunchao Zhang PetscErrorCode ierr; 152686a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 152786a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 152886a27549SJunchao Zhang PetscInt fill_lev = info->levels; 152986a27549SJunchao Zhang 153086a27549SJunchao Zhang PetscFunctionBegin; 153186a27549SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1532076ba34aSJunchao Zhang 1533076ba34aSJunchao Zhang auto a_d = aijkok->a_dual.view_device(); 1534076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1535076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1536076ba34aSJunchao Zhang 1537076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 153886a27549SJunchao Zhang 153986a27549SJunchao Zhang B->assembled = PETSC_TRUE; 154086a27549SJunchao Zhang B->preallocated = PETSC_TRUE; 154186a27549SJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos; 154286a27549SJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 154386a27549SJunchao Zhang B->ops->matsolve = NULL; 154486a27549SJunchao Zhang B->ops->matsolvetranspose = NULL; 154586a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 154686a27549SJunchao Zhang 154786a27549SJunchao Zhang /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 154886a27549SJunchao Zhang factors->transpose_updated = PETSC_FALSE; 154986a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 155086a27549SJunchao Zhang PetscFunctionReturn(0); 155186a27549SJunchao Zhang } 155286a27549SJunchao Zhang 155386a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 155486a27549SJunchao Zhang { 155586a27549SJunchao Zhang PetscErrorCode ierr; 155686a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 155786a27549SJunchao Zhang Mat_SeqAIJ *b; 155886a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 155986a27549SJunchao Zhang PetscInt fill_lev = info->levels; 156086a27549SJunchao Zhang PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 156186a27549SJunchao Zhang PetscInt n = A->rmap->n; 156286a27549SJunchao Zhang 156386a27549SJunchao Zhang PetscFunctionBegin; 156486a27549SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 156586a27549SJunchao Zhang /* Rebuild factors */ 156686a27549SJunchao Zhang if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 156786a27549SJunchao Zhang else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 156886a27549SJunchao Zhang 156986a27549SJunchao Zhang /* Create a spiluk handle and then do symbolic factorization */ 157086a27549SJunchao Zhang nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 157186a27549SJunchao Zhang factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 157286a27549SJunchao Zhang 157386a27549SJunchao Zhang auto spiluk_handle = factors->kh.get_spiluk_handle(); 157486a27549SJunchao Zhang 157586a27549SJunchao Zhang Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 157686a27549SJunchao Zhang Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 157786a27549SJunchao Zhang Kokkos::realloc(factors->iU_d,n+1); 157886a27549SJunchao Zhang Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 157986a27549SJunchao Zhang 158086a27549SJunchao Zhang aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1581076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1582076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1583076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 158486a27549SJunchao Zhang /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 158586a27549SJunchao Zhang 158686a27549SJunchao Zhang Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 158786a27549SJunchao Zhang Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 158886a27549SJunchao Zhang Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 158986a27549SJunchao Zhang Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 159086a27549SJunchao Zhang 159186a27549SJunchao Zhang /* TODO: add options to select sptrsv algorithms */ 159286a27549SJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 159386a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 159486a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 159586a27549SJunchao Zhang #else 159686a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 159786a27549SJunchao Zhang #endif 159886a27549SJunchao Zhang 159986a27549SJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 160086a27549SJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 160186a27549SJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 160286a27549SJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 160386a27549SJunchao Zhang 160486a27549SJunchao Zhang /* Fill fields of the factor matrix B */ 160586a27549SJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 160686a27549SJunchao Zhang b = (Mat_SeqAIJ*)B->data; 160786a27549SJunchao Zhang b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 160886a27549SJunchao Zhang B->info.fill_ratio_given = info->fill; 160986a27549SJunchao Zhang B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 161086a27549SJunchao Zhang 161186a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 161286a27549SJunchao Zhang B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1613930e68a5SMark Adams PetscFunctionReturn(0); 1614930e68a5SMark Adams } 1615930e68a5SMark Adams 16168f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 16178f7e8f9dSMark Adams { 16188f7e8f9dSMark Adams PetscErrorCode ierr; 16198f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 16208f7e8f9dSMark Adams const PetscInt nrows = A->rmap->n; 1621930e68a5SMark Adams 16228f7e8f9dSMark Adams PetscFunctionBegin; 16238f7e8f9dSMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 16248f7e8f9dSMark Adams B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 16258f7e8f9dSMark Adams // move B data into Kokkos 16268f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 16278f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 16288f7e8f9dSMark Adams { 16298f7e8f9dSMark Adams Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 16308f7e8f9dSMark Adams if (!baijkok->diag_d) { 16318f7e8f9dSMark Adams const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1632152b3e56SJunchao Zhang baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 16338f7e8f9dSMark Adams Kokkos::deep_copy (*baijkok->diag_d, h_diag); 16348f7e8f9dSMark Adams } 16358f7e8f9dSMark Adams } 16368f7e8f9dSMark Adams PetscFunctionReturn(0); 16378f7e8f9dSMark Adams } 16388f7e8f9dSMark Adams 163986a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1640930e68a5SMark Adams { 1641930e68a5SMark Adams PetscFunctionBegin; 1642930e68a5SMark Adams *type = MATSOLVERKOKKOS; 1643930e68a5SMark Adams PetscFunctionReturn(0); 1644930e68a5SMark Adams } 1645930e68a5SMark Adams 16468f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 16478f7e8f9dSMark Adams { 16488f7e8f9dSMark Adams PetscFunctionBegin; 16498f7e8f9dSMark Adams *type = MATSOLVERKOKKOSDEVICE; 16508f7e8f9dSMark Adams PetscFunctionReturn(0); 16518f7e8f9dSMark Adams } 16528f7e8f9dSMark Adams 1653930e68a5SMark Adams /*MC 165486a27549SJunchao Zhang MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 165586a27549SJunchao Zhang on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1656930e68a5SMark Adams 1657930e68a5SMark Adams Level: beginner 1658930e68a5SMark Adams 165986a27549SJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1660930e68a5SMark Adams M*/ 166186a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1662930e68a5SMark Adams { 1663930e68a5SMark Adams PetscErrorCode ierr; 1664930e68a5SMark Adams PetscInt n = A->rmap->n; 1665930e68a5SMark Adams 1666930e68a5SMark Adams PetscFunctionBegin; 1667930e68a5SMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1668930e68a5SMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1669930e68a5SMark Adams (*B)->factortype = ftype; 16704ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1671930e68a5SMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1672930e68a5SMark Adams 16738f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 1674930e68a5SMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 167586a27549SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 167686a27549SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 167786a27549SJunchao Zhang } else if (ftype == MAT_FACTOR_ILU) { 167886a27549SJunchao Zhang ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 167986a27549SJunchao Zhang (*B)->canuseordering = PETSC_FALSE; 168086a27549SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 168186a27549SJunchao Zhang } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1682930e68a5SMark Adams 1683930e68a5SMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 168486a27549SJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1685930e68a5SMark Adams PetscFunctionReturn(0); 1686930e68a5SMark Adams } 16878f7e8f9dSMark Adams 16888f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 16898f7e8f9dSMark Adams { 16908f7e8f9dSMark Adams PetscErrorCode ierr; 16918f7e8f9dSMark Adams PetscInt n = A->rmap->n; 16928f7e8f9dSMark Adams 16938f7e8f9dSMark Adams PetscFunctionBegin; 16948f7e8f9dSMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 16958f7e8f9dSMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 16968f7e8f9dSMark Adams (*B)->factortype = ftype; 1697f73b0415SBarry Smith (*B)->canuseordering = PETSC_TRUE; 16984ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 16998f7e8f9dSMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 17008f7e8f9dSMark Adams 17018f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 17028f7e8f9dSMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 17038f7e8f9dSMark Adams (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 17048f7e8f9dSMark Adams } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 17058f7e8f9dSMark Adams 17068f7e8f9dSMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 17078f7e8f9dSMark Adams ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 17088f7e8f9dSMark Adams PetscFunctionReturn(0); 17098f7e8f9dSMark Adams } 171086a27549SJunchao Zhang 171186a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 171286a27549SJunchao Zhang { 171386a27549SJunchao Zhang PetscErrorCode ierr; 171486a27549SJunchao Zhang 171586a27549SJunchao Zhang PetscFunctionBegin; 171686a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 171786a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 171886a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 171986a27549SJunchao Zhang PetscFunctionReturn(0); 172086a27549SJunchao Zhang } 172186a27549SJunchao Zhang 1722076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */ 1723076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1724076ba34aSJunchao Zhang { 1725076ba34aSJunchao Zhang PetscErrorCode ierr; 1726076ba34aSJunchao Zhang const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1727076ba34aSJunchao Zhang const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1728076ba34aSJunchao Zhang const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1729076ba34aSJunchao Zhang const PetscInt *i = iv.data(); 1730076ba34aSJunchao Zhang const PetscInt *j = jv.data(); 1731076ba34aSJunchao Zhang const PetscScalar *a = av.data(); 1732076ba34aSJunchao Zhang PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1733076ba34aSJunchao Zhang 1734076ba34aSJunchao Zhang PetscFunctionBegin; 1735076ba34aSJunchao Zhang ierr = PetscPrintf(PETSC_COMM_SELF,"%D x %D SeqAIJKokkos, with %D nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1736076ba34aSJunchao Zhang for (PetscInt k=0; k<m; k++) { 1737076ba34aSJunchao Zhang ierr = PetscPrintf(PETSC_COMM_SELF,"%D: ",k);CHKERRQ(ierr); 1738076ba34aSJunchao Zhang for (PetscInt p=i[k]; p<i[k+1]; p++) { 1739076ba34aSJunchao Zhang ierr = PetscPrintf(PETSC_COMM_SELF,"%D(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1740076ba34aSJunchao Zhang } 1741076ba34aSJunchao Zhang ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1742076ba34aSJunchao Zhang } 1743076ba34aSJunchao Zhang PetscFunctionReturn(0); 1744076ba34aSJunchao Zhang } 1745