111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 2076ba34aSJunchao Zhang #include <petscpkg_version.h> 3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 48c3ff71bSJunchao Zhang #include <petscsystypes.h> 58c3ff71bSJunchao Zhang #include <petscerror.h> 68c3ff71bSJunchao Zhang 78c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 8f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 9076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp> 108c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 118c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 1286a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp> 1386a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp> 14076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp> 15076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp> 1686a27549SJunchao Zhang 178c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h> 188c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 198c3ff71bSJunchao Zhang 208c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 218c3ff71bSJunchao Zhang 22076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23076ba34aSJunchao Zhang we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24076ba34aSJunchao Zhang In the latter case, it is important to set a_dual's sync state correctly. 25076ba34aSJunchao Zhang */ 268c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 278c3ff71bSJunchao Zhang { 288c3ff71bSJunchao Zhang PetscErrorCode ierr; 29076ba34aSJunchao Zhang Mat_SeqAIJ *aijseq; 30076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 318c3ff71bSJunchao Zhang 328c3ff71bSJunchao Zhang PetscFunctionBegin; 33076ba34aSJunchao Zhang if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 348c3ff71bSJunchao Zhang ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 35076ba34aSJunchao Zhang 36076ba34aSJunchao Zhang aijseq = static_cast<Mat_SeqAIJ*>(A->data); 37076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 38076ba34aSJunchao Zhang 39076ba34aSJunchao Zhang /* If aijkok does not exist, we just copy i, j to device. 40076ba34aSJunchao Zhang If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 41076ba34aSJunchao Zhang In both cases, we build a new aijkok structure. 42076ba34aSJunchao Zhang */ 43076ba34aSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 44076ba34aSJunchao Zhang delete aijkok; 45076ba34aSJunchao Zhang aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 46076ba34aSJunchao Zhang A->spptr = aijkok; 47076ba34aSJunchao Zhang } 48076ba34aSJunchao Zhang 49a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 50a587d139SMark A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 51a587d139SMark } 528c3ff71bSJunchao Zhang PetscFunctionReturn(0); 538c3ff71bSJunchao Zhang } 548c3ff71bSJunchao Zhang 5586a27549SJunchao Zhang /* Sync CSR data to device if not yet */ 56076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 578c3ff71bSJunchao Zhang { 588c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 598c3ff71bSJunchao Zhang 608c3ff71bSJunchao Zhang PetscFunctionBegin; 6186a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 62076ba34aSJunchao Zhang if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); 63076ba34aSJunchao Zhang if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 64076ba34aSJunchao Zhang if (aijkok->a_dual.need_sync_device()) { 65076ba34aSJunchao Zhang aijkok->a_dual.sync_device(); 66076ba34aSJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */ 6786a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 688c3ff71bSJunchao Zhang } 698c3ff71bSJunchao Zhang PetscFunctionReturn(0); 708c3ff71bSJunchao Zhang } 718c3ff71bSJunchao Zhang 72076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */ 73076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 7486a27549SJunchao Zhang { 7586a27549SJunchao Zhang PetscErrorCode ierr; 7686a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 7786a27549SJunchao Zhang 7886a27549SJunchao Zhang PetscFunctionBegin; 7986a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 8086a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 8186a27549SJunchao Zhang aijkok->a_dual.modify_device(); 8286a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 8386a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 8486a27549SJunchao Zhang ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 8586a27549SJunchao Zhang PetscFunctionReturn(0); 8686a27549SJunchao Zhang } 8786a27549SJunchao Zhang 88f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 89f0cf5187SStefano Zampini { 90f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 91f0cf5187SStefano Zampini 92f0cf5187SStefano Zampini PetscFunctionBegin; 93f0cf5187SStefano Zampini PetscCheckTypeName(A,MATSEQAIJKOKKOS); 9486a27549SJunchao Zhang /* We do not expect one needs factors on host */ 9586a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 96f0cf5187SStefano Zampini if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 97076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 98f0cf5187SStefano Zampini PetscFunctionReturn(0); 99f0cf5187SStefano Zampini } 100f0cf5187SStefano Zampini 101f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 102f0cf5187SStefano Zampini { 103076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 104f0cf5187SStefano Zampini 105f0cf5187SStefano Zampini PetscFunctionBegin; 106076ba34aSJunchao Zhang if (aijkok) { 107076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 108076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 109076ba34aSJunchao Zhang } else { /* Happens when calling MatSetValues on a newly created matrix */ 110076ba34aSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 111076ba34aSJunchao Zhang } 112076ba34aSJunchao Zhang PetscFunctionReturn(0); 113076ba34aSJunchao Zhang } 114076ba34aSJunchao Zhang 115076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 116076ba34aSJunchao Zhang { 117076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 118076ba34aSJunchao Zhang 119076ba34aSJunchao Zhang PetscFunctionBegin; 120076ba34aSJunchao Zhang if (aijkok) aijkok->a_dual.modify_host(); 121076ba34aSJunchao Zhang PetscFunctionReturn(0); 122076ba34aSJunchao Zhang } 123076ba34aSJunchao Zhang 124076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 125076ba34aSJunchao Zhang { 126076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 127076ba34aSJunchao Zhang 128076ba34aSJunchao Zhang PetscFunctionBegin; 129076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 130076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 131076ba34aSJunchao Zhang PetscFunctionReturn(0); 132076ba34aSJunchao Zhang } 133076ba34aSJunchao Zhang 134076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 135076ba34aSJunchao Zhang { 136076ba34aSJunchao Zhang PetscFunctionBegin; 137076ba34aSJunchao Zhang *array = NULL; 138076ba34aSJunchao Zhang PetscFunctionReturn(0); 139076ba34aSJunchao Zhang } 140076ba34aSJunchao Zhang 141076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 142076ba34aSJunchao Zhang { 143076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 144076ba34aSJunchao Zhang 145076ba34aSJunchao Zhang PetscFunctionBegin; 146076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 147076ba34aSJunchao Zhang PetscFunctionReturn(0); 148076ba34aSJunchao Zhang } 149076ba34aSJunchao Zhang 150076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 151076ba34aSJunchao Zhang { 152076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 153076ba34aSJunchao Zhang 154076ba34aSJunchao Zhang PetscFunctionBegin; 155076ba34aSJunchao Zhang aijkok->a_dual.clear_sync_state(); 156076ba34aSJunchao Zhang aijkok->a_dual.modify_host(); 157f0cf5187SStefano Zampini PetscFunctionReturn(0); 158f0cf5187SStefano Zampini } 159f0cf5187SStefano Zampini 160a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 161042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 162a587d139SMark { 163042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 164042217e8SBarry Smith Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 165a587d139SMark 166a587d139SMark PetscFunctionBegin; 167076ba34aSJunchao Zhang if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 168152b3e56SJunchao Zhang aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 169a587d139SMark Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 170a587d139SMark PetscFunctionReturn(0); 171a587d139SMark } 172a587d139SMark 173a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 174042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 175a587d139SMark { 176042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 177a587d139SMark 178a587d139SMark PetscFunctionBegin; 179a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 180a587d139SMark *d_mat = aijkok->device_mat_d.data(); 181a587d139SMark } else { 182a587d139SMark PetscErrorCode ierr; 183a587d139SMark ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 184a587d139SMark *d_mat = NULL; 185a587d139SMark } 186a587d139SMark PetscFunctionReturn(0); 187a587d139SMark } 188076ba34aSJunchao Zhang 189076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */ 190076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 191152b3e56SJunchao Zhang { 192152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 193152b3e56SJunchao Zhang 194152b3e56SJunchao Zhang PetscFunctionBegin; 195076ba34aSJunchao Zhang if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 196076ba34aSJunchao Zhang if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 197076ba34aSJunchao Zhang /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 198076ba34aSJunchao Zhang CHKERRCXX(aijkok->a_dual.sync_device()); 199076ba34aSJunchao Zhang CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 200076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 20186a27549SJunchao Zhang aijkok->transpose_updated = PETSC_TRUE; 202076ba34aSJunchao Zhang } 203076ba34aSJunchao Zhang *csrmatT = &aijkok->csrmatT; 204152b3e56SJunchao Zhang PetscFunctionReturn(0); 205152b3e56SJunchao Zhang } 206152b3e56SJunchao Zhang 207076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */ 208076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 209152b3e56SJunchao Zhang { 210152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 211152b3e56SJunchao Zhang 212152b3e56SJunchao Zhang PetscFunctionBegin; 213076ba34aSJunchao Zhang if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 214076ba34aSJunchao Zhang if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 215076ba34aSJunchao Zhang CHKERRCXX(aijkok->a_dual.sync_device()); 216076ba34aSJunchao Zhang CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 217076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 218076ba34aSJunchao Zhang #if defined(PETSC_USE_COMPLEX) 219076ba34aSJunchao Zhang const auto& a = aijkok->csrmatH.values; 220076ba34aSJunchao Zhang Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 221076ba34aSJunchao Zhang #endif 22286a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_TRUE; 223076ba34aSJunchao Zhang } 224076ba34aSJunchao Zhang *csrmatH = &aijkok->csrmatH; 225152b3e56SJunchao Zhang PetscFunctionReturn(0); 226152b3e56SJunchao Zhang } 227a587d139SMark 2288c3ff71bSJunchao Zhang /* y = A x */ 2298c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2308c3ff71bSJunchao Zhang { 2318c3ff71bSJunchao Zhang PetscErrorCode ierr; 2328c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 233152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 234152b3e56SJunchao Zhang PetscScalarKokkosView yv; 2358c3ff71bSJunchao Zhang 2368c3ff71bSJunchao Zhang PetscFunctionBegin; 2378c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 238152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 239152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 2408c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 241152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 242152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 243152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 244bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 245076ba34aSJunchao Zhang /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 246152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 2478c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2488c3ff71bSJunchao Zhang } 2498c3ff71bSJunchao Zhang 2508c3ff71bSJunchao Zhang /* y = A^T x */ 2518c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2528c3ff71bSJunchao Zhang { 2538c3ff71bSJunchao Zhang PetscErrorCode ierr; 2548c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 255152b3e56SJunchao Zhang const char *mode; 256152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 257152b3e56SJunchao Zhang PetscScalarKokkosView yv; 258076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 2598c3ff71bSJunchao Zhang 2608c3ff71bSJunchao Zhang PetscFunctionBegin; 2618c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 262152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 263152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 264152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 265076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 266152b3e56SJunchao Zhang mode = "N"; 267152b3e56SJunchao Zhang } else { 268076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 269076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 270152b3e56SJunchao Zhang mode = "T"; 271152b3e56SJunchao Zhang } 272076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 273152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 274152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 275bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 276076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 2778c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2788c3ff71bSJunchao Zhang } 2798c3ff71bSJunchao Zhang 2808c3ff71bSJunchao Zhang /* y = A^H x */ 2818c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2828c3ff71bSJunchao Zhang { 2838c3ff71bSJunchao Zhang PetscErrorCode ierr; 2848c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 285152b3e56SJunchao Zhang const char *mode; 286152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 287152b3e56SJunchao Zhang PetscScalarKokkosView yv; 288076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 2898c3ff71bSJunchao Zhang 2908c3ff71bSJunchao Zhang PetscFunctionBegin; 2918c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 292152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 293152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 294152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 295076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 296152b3e56SJunchao Zhang mode = "N"; 297152b3e56SJunchao Zhang } else { 298076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 299076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 300152b3e56SJunchao Zhang mode = "C"; 301152b3e56SJunchao Zhang } 302076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 303152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 304152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 305bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 306076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 3078c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3088c3ff71bSJunchao Zhang } 3098c3ff71bSJunchao Zhang 3108c3ff71bSJunchao Zhang /* z = A x + y */ 3118c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 3128c3ff71bSJunchao Zhang { 3138c3ff71bSJunchao Zhang PetscErrorCode ierr; 3148c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 315152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 316152b3e56SJunchao Zhang PetscScalarKokkosView zv; 3178c3ff71bSJunchao Zhang 3188c3ff71bSJunchao Zhang PetscFunctionBegin; 3198c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 320152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 321152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 322152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 3238c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 3248c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 325152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 326152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 327152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 328152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 329bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 330152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 3318c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3328c3ff71bSJunchao Zhang } 3338c3ff71bSJunchao Zhang 3348c3ff71bSJunchao Zhang /* z = A^T x + y */ 3358c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3368c3ff71bSJunchao Zhang { 3378c3ff71bSJunchao Zhang PetscErrorCode ierr; 3388c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 339152b3e56SJunchao Zhang const char *mode; 340152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 341152b3e56SJunchao Zhang PetscScalarKokkosView zv; 342076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3438c3ff71bSJunchao Zhang 3448c3ff71bSJunchao Zhang PetscFunctionBegin; 3458c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 346152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 347152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 348152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 3498c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 350152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 351076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 352152b3e56SJunchao Zhang mode = "N"; 353152b3e56SJunchao Zhang } else { 354076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 355076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 356152b3e56SJunchao Zhang mode = "T"; 357152b3e56SJunchao Zhang } 358076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 359152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 360152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 361152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 362bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 363076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 3648c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3658c3ff71bSJunchao Zhang } 3668c3ff71bSJunchao Zhang 3678c3ff71bSJunchao Zhang /* z = A^H x + y */ 3688c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3698c3ff71bSJunchao Zhang { 3708c3ff71bSJunchao Zhang PetscErrorCode ierr; 3718c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 372152b3e56SJunchao Zhang const char *mode; 373152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 374152b3e56SJunchao Zhang PetscScalarKokkosView zv; 375076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3768c3ff71bSJunchao Zhang 3778c3ff71bSJunchao Zhang PetscFunctionBegin; 3788c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 379152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 380152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 381152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 3828c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 383152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 384076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 385152b3e56SJunchao Zhang mode = "N"; 386152b3e56SJunchao Zhang } else { 387076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 388076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 389152b3e56SJunchao Zhang mode = "C"; 390152b3e56SJunchao Zhang } 391076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 392152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 393152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 394152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 395bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 396076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 397152b3e56SJunchao Zhang PetscFunctionReturn(0); 398152b3e56SJunchao Zhang } 399152b3e56SJunchao Zhang 400152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 401152b3e56SJunchao Zhang { 402152b3e56SJunchao Zhang PetscErrorCode ierr; 403152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 404152b3e56SJunchao Zhang 405152b3e56SJunchao Zhang PetscFunctionBegin; 406152b3e56SJunchao Zhang switch (op) { 407152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 408152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 409152b3e56SJunchao Zhang if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 410152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 411152b3e56SJunchao Zhang break; 412152b3e56SJunchao Zhang default: 413152b3e56SJunchao Zhang ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 414152b3e56SJunchao Zhang break; 415152b3e56SJunchao Zhang } 4168c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4178c3ff71bSJunchao Zhang } 4188c3ff71bSJunchao Zhang 419076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */ 4203d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 4218c3ff71bSJunchao Zhang { 4228c3ff71bSJunchao Zhang PetscErrorCode ierr; 423076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 4248c3ff71bSJunchao Zhang 4258c3ff71bSJunchao Zhang PetscFunctionBegin; 426a3f881fbSStefano Zampini ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 427076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 428076ba34aSJunchao Zhang ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 4298c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 430076ba34aSJunchao Zhang ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 431076ba34aSJunchao Zhang } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 432076ba34aSJunchao Zhang if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 433076ba34aSJunchao Zhang ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 434076ba34aSJunchao Zhang ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 435076ba34aSJunchao Zhang ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 436076ba34aSJunchao Zhang ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 437076ba34aSJunchao Zhang aseq = static_cast<Mat_SeqAIJ*>(A->data); 438076ba34aSJunchao Zhang if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */ 439076ba34aSJunchao Zhang if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 440076ba34aSJunchao Zhang A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 4418c3ff71bSJunchao Zhang } 442076ba34aSJunchao Zhang } 4438c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4448c3ff71bSJunchao Zhang } 4458c3ff71bSJunchao Zhang 446076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 447076ba34aSJunchao Zhang an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 448076ba34aSJunchao Zhang */ 449076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 4508c3ff71bSJunchao Zhang { 4518c3ff71bSJunchao Zhang PetscErrorCode ierr; 452076ba34aSJunchao Zhang Mat_SeqAIJ *bseq; 453076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 454076ba34aSJunchao Zhang Mat mat; 4558c3ff71bSJunchao Zhang 4568c3ff71bSJunchao Zhang PetscFunctionBegin; 457076ba34aSJunchao Zhang /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 458076ba34aSJunchao Zhang ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 459076ba34aSJunchao Zhang mat = *B; 460076ba34aSJunchao Zhang if (A->assembled) { 461076ba34aSJunchao Zhang bseq = static_cast<Mat_SeqAIJ*>(mat->data); 462076ba34aSJunchao Zhang bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 463076ba34aSJunchao Zhang bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 464076ba34aSJunchao Zhang /* Now copy values to B if needed */ 465076ba34aSJunchao Zhang if (dupOption == MAT_COPY_VALUES) { 466076ba34aSJunchao Zhang if (akok->a_dual.need_sync_device()) { 467076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 468076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 469076ba34aSJunchao Zhang } else { /* If device has the latest data, we only copy data on device */ 470076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 471076ba34aSJunchao Zhang bkok->a_dual.modify_device(); 472076ba34aSJunchao Zhang } 473076ba34aSJunchao Zhang } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 474076ba34aSJunchao Zhang /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 475076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 476076ba34aSJunchao Zhang } 477076ba34aSJunchao Zhang mat->spptr = bkok; 478076ba34aSJunchao Zhang } 479076ba34aSJunchao Zhang 480076ba34aSJunchao Zhang ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 481076ba34aSJunchao Zhang ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 482076ba34aSJunchao Zhang ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 483076ba34aSJunchao Zhang ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 4848c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4858c3ff71bSJunchao Zhang } 4868c3ff71bSJunchao Zhang 4878c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 4888c3ff71bSJunchao Zhang { 4898c3ff71bSJunchao Zhang PetscErrorCode ierr; 49086a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 4918c3ff71bSJunchao Zhang 4928c3ff71bSJunchao Zhang PetscFunctionBegin; 49386a27549SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 49486a27549SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 4958f7e8f9dSMark Adams if (aijkok) { 4968f7e8f9dSMark Adams if (aijkok->device_mat_d.data()) { 497a587d139SMark delete aijkok->colmap_d; 498a587d139SMark delete aijkok->i_uncompressed_d; 499a587d139SMark } 5008f7e8f9dSMark Adams if (aijkok->diag_d) delete aijkok->diag_d; 5018f7e8f9dSMark Adams } 5028c3ff71bSJunchao Zhang delete aijkok; 50386a27549SJunchao Zhang } else { 50486a27549SJunchao Zhang delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 50586a27549SJunchao Zhang } 506152b3e56SJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 507152b3e56SJunchao Zhang A->spptr = NULL; 5088c3ff71bSJunchao Zhang ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 5098c3ff71bSJunchao Zhang PetscFunctionReturn(0); 5108c3ff71bSJunchao Zhang } 5118c3ff71bSJunchao Zhang 51286a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 51386a27549SJunchao Zhang { 51486a27549SJunchao Zhang PetscErrorCode ierr; 51586a27549SJunchao Zhang 51686a27549SJunchao Zhang PetscFunctionBegin; 51786a27549SJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 51886a27549SJunchao Zhang ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 51986a27549SJunchao Zhang ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 52086a27549SJunchao Zhang PetscFunctionReturn(0); 52186a27549SJunchao Zhang } 52286a27549SJunchao Zhang 523076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 524076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 525a3f881fbSStefano Zampini { 526076ba34aSJunchao Zhang PetscErrorCode ierr; 527076ba34aSJunchao Zhang Mat_SeqAIJ *a,*b; 528076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok,*bkok,*ckok; 529076ba34aSJunchao Zhang MatScalarKokkosView aa,ba,ca; 530076ba34aSJunchao Zhang MatRowMapKokkosView ai,bi,ci; 531076ba34aSJunchao Zhang MatColIdxKokkosView aj,bj,cj; 532076ba34aSJunchao Zhang PetscInt m,n,nnz,aN; 533a3f881fbSStefano Zampini 534a3f881fbSStefano Zampini PetscFunctionBegin; 535076ba34aSJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 536076ba34aSJunchao Zhang PetscValidHeaderSpecific(B,MAT_CLASSID,2); 537076ba34aSJunchao Zhang PetscValidPointer(C,4); 538076ba34aSJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 539076ba34aSJunchao Zhang PetscCheckTypeName(B,MATSEQAIJKOKKOS); 540076ba34aSJunchao Zhang if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); 541076ba34aSJunchao Zhang if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 542076ba34aSJunchao Zhang 543076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 544076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 545076ba34aSJunchao Zhang a = static_cast<Mat_SeqAIJ*>(A->data); 546076ba34aSJunchao Zhang b = static_cast<Mat_SeqAIJ*>(B->data); 547076ba34aSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 548076ba34aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 549076ba34aSJunchao Zhang aa = akok->a_dual.view_device(); 550076ba34aSJunchao Zhang ai = akok->i_dual.view_device(); 551076ba34aSJunchao Zhang ba = bkok->a_dual.view_device(); 552076ba34aSJunchao Zhang bi = bkok->i_dual.view_device(); 553076ba34aSJunchao Zhang m = A->rmap->n; /* M, N and nnz of C */ 554076ba34aSJunchao Zhang n = A->cmap->n + B->cmap->n; 555076ba34aSJunchao Zhang nnz = a->nz + b->nz; 556076ba34aSJunchao Zhang aN = A->cmap->n; /* N of A */ 557076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { 558076ba34aSJunchao Zhang aj = akok->j_dual.view_device(); 559076ba34aSJunchao Zhang bj = bkok->j_dual.view_device(); 560076ba34aSJunchao Zhang auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 561076ba34aSJunchao Zhang auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 562076ba34aSJunchao Zhang auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 563076ba34aSJunchao Zhang ca = ca_dual.view_device(); 564076ba34aSJunchao Zhang ci = ci_dual.view_device(); 565076ba34aSJunchao Zhang cj = cj_dual.view_device(); 566076ba34aSJunchao Zhang 567076ba34aSJunchao Zhang /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 568076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 569076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 570076ba34aSJunchao Zhang PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 571076ba34aSJunchao Zhang 572076ba34aSJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 573076ba34aSJunchao Zhang ci(i) = coffset; 574076ba34aSJunchao Zhang if (i == m-1) ci(m) = ai(m) + bi(m); 575076ba34aSJunchao Zhang }); 576076ba34aSJunchao Zhang 577076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 578076ba34aSJunchao Zhang if (k < alen) { 579076ba34aSJunchao Zhang ca(coffset+k) = aa(ai(i)+k); 580076ba34aSJunchao Zhang cj(coffset+k) = aj(ai(i)+k); 581076ba34aSJunchao Zhang } else { 582076ba34aSJunchao Zhang ca(coffset+k) = ba(bi(i)+k-alen); 583076ba34aSJunchao Zhang cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 584076ba34aSJunchao Zhang } 585076ba34aSJunchao Zhang }); 586076ba34aSJunchao Zhang }); 587076ba34aSJunchao Zhang ca_dual.modify_device(); 588076ba34aSJunchao Zhang ci_dual.modify_device(); 589076ba34aSJunchao Zhang cj_dual.modify_device(); 590076ba34aSJunchao Zhang CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 591076ba34aSJunchao Zhang ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 592076ba34aSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { 593076ba34aSJunchao Zhang PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 594076ba34aSJunchao Zhang PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 595076ba34aSJunchao Zhang ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 596076ba34aSJunchao Zhang ca = ckok->a_dual.view_device(); 597076ba34aSJunchao Zhang ci = ckok->i_dual.view_device(); 598076ba34aSJunchao Zhang 599076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 600076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 601076ba34aSJunchao Zhang PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 602076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 603076ba34aSJunchao Zhang if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 604076ba34aSJunchao Zhang else ca(ci(i)+k) = ba(bi(i)+k-alen); 605076ba34aSJunchao Zhang }); 606076ba34aSJunchao Zhang }); 607076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 608076ba34aSJunchao Zhang } 609076ba34aSJunchao Zhang PetscFunctionReturn(0); 610076ba34aSJunchao Zhang } 611076ba34aSJunchao Zhang 612076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 613076ba34aSJunchao Zhang { 614076ba34aSJunchao Zhang PetscFunctionBegin; 615076ba34aSJunchao Zhang delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 616a3f881fbSStefano Zampini PetscFunctionReturn(0); 617a3f881fbSStefano Zampini } 618a3f881fbSStefano Zampini 619a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 620a3f881fbSStefano Zampini { 621076ba34aSJunchao Zhang PetscErrorCode ierr; 622a3f881fbSStefano Zampini Mat_Product *product = C->product; 623a3f881fbSStefano Zampini Mat A,B; 624076ba34aSJunchao Zhang bool transA,transB; /* use bool, since KK needs this type */ 625a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok,*bkok,*ckok; 626a3f881fbSStefano Zampini Mat_SeqAIJ *c; 627076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 628076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA,*csrmatB; 629a3f881fbSStefano Zampini 630a3f881fbSStefano Zampini PetscFunctionBegin; 631a3f881fbSStefano Zampini MatCheckProduct(C,1); 632a3f881fbSStefano Zampini if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 633076ba34aSJunchao Zhang pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 634076ba34aSJunchao Zhang 635076ba34aSJunchao Zhang if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 636076ba34aSJunchao Zhang pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 637076ba34aSJunchao Zhang PetscFunctionReturn(0); 638076ba34aSJunchao Zhang } 639076ba34aSJunchao Zhang 640076ba34aSJunchao Zhang switch (product->type) { 641076ba34aSJunchao Zhang case MATPRODUCT_AB: transA = false; transB = false; break; 642076ba34aSJunchao Zhang case MATPRODUCT_AtB: transA = true; transB = false; break; 643076ba34aSJunchao Zhang case MATPRODUCT_ABt: transA = false; transB = true; break; 644076ba34aSJunchao Zhang default: 645076ba34aSJunchao Zhang SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 646076ba34aSJunchao Zhang } 647076ba34aSJunchao Zhang 648a3f881fbSStefano Zampini A = product->A; 649a3f881fbSStefano Zampini B = product->B; 650a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 651a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 652a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 653a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 654a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 655076ba34aSJunchao Zhang 656076ba34aSJunchao Zhang if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 657076ba34aSJunchao Zhang 658076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 659076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 660076ba34aSJunchao Zhang 661076ba34aSJunchao Zhang /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 662076ba34aSJunchao Zhang if (transA) { 663076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 664076ba34aSJunchao Zhang transA = false; 665a3f881fbSStefano Zampini } 666a3f881fbSStefano Zampini 667076ba34aSJunchao Zhang if (transB) { 668076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 669076ba34aSJunchao Zhang transB = false; 670076ba34aSJunchao Zhang } 671076ba34aSJunchao Zhang 672076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 673076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 674076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 675a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 676a3f881fbSStefano Zampini c = (Mat_SeqAIJ*)C->data; 677a3f881fbSStefano Zampini ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 678a3f881fbSStefano Zampini ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 679a3f881fbSStefano Zampini ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 680a3f881fbSStefano Zampini c->reallocs = 0; 681076ba34aSJunchao Zhang C->info.mallocs = 0; 682a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 683a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 684a3f881fbSStefano Zampini C->num_ass++; 685a3f881fbSStefano Zampini PetscFunctionReturn(0); 686a3f881fbSStefano Zampini } 687a3f881fbSStefano Zampini 688a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 689a3f881fbSStefano Zampini { 690a3f881fbSStefano Zampini PetscErrorCode ierr; 691076ba34aSJunchao Zhang Mat_Product *product = C->product; 692076ba34aSJunchao Zhang MatProductType ptype; 693076ba34aSJunchao Zhang Mat A,B; 694076ba34aSJunchao Zhang bool transA,transB; 695076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok,*bkok,*ckok; 696076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 697076ba34aSJunchao Zhang MPI_Comm comm; 698076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 699a3f881fbSStefano Zampini 700a3f881fbSStefano Zampini PetscFunctionBegin; 701a3f881fbSStefano Zampini MatCheckProduct(C,1); 702076ba34aSJunchao Zhang ierr = PetscObjectGetComm((PetscObject)C,&comm); 703076ba34aSJunchao Zhang if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty"); 704a3f881fbSStefano Zampini A = product->A; 705a3f881fbSStefano Zampini B = product->B; 706a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 707a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 708a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 709a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 710076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 711076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 712076ba34aSJunchao Zhang 713a3f881fbSStefano Zampini ptype = product->type; 714a3f881fbSStefano Zampini switch (ptype) { 715076ba34aSJunchao Zhang case MATPRODUCT_AB: transA = false; transB = false; break; 716076ba34aSJunchao Zhang case MATPRODUCT_AtB: transA = true; transB = false; break; 717076ba34aSJunchao Zhang case MATPRODUCT_ABt: transA = false; transB = true; break; 718a3f881fbSStefano Zampini default: 719076ba34aSJunchao Zhang SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 720a3f881fbSStefano Zampini } 721a3f881fbSStefano Zampini 722076ba34aSJunchao Zhang product->data = pdata = new MatProductData_SeqAIJKokkos(); 723076ba34aSJunchao Zhang pdata->kh.set_team_work_size(16); 724076ba34aSJunchao Zhang pdata->kh.set_dynamic_scheduling(true); 725076ba34aSJunchao Zhang pdata->reusesym = product->api_user; 726a3f881fbSStefano Zampini 727076ba34aSJunchao Zhang /* TODO: add command line options to select spgemm algorithms */ 728076ba34aSJunchao Zhang auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 729076ba34aSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 730076ba34aSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 731076ba34aSJunchao Zhang /* This algorithm + cuda-10.2 sometimes gave wrong results (invalid device pointers in csrmatC) and failed snes/tutorials/ex56.c */ 732076ba34aSJunchao Zhang spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_CUSPARSE; 733076ba34aSJunchao Zhang #endif 734076ba34aSJunchao Zhang #endif 735076ba34aSJunchao Zhang pdata->kh.create_spgemm_handle(spgemm_alg); 736076ba34aSJunchao Zhang 737076ba34aSJunchao Zhang /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 738076ba34aSJunchao Zhang if (transA) { 739076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 740076ba34aSJunchao Zhang transA = false; 741076ba34aSJunchao Zhang } 742076ba34aSJunchao Zhang 743076ba34aSJunchao Zhang if (transB) { 744076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 745076ba34aSJunchao Zhang transB = false; 746076ba34aSJunchao Zhang } 747076ba34aSJunchao Zhang 748076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 749076ba34aSJunchao Zhang /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 750076ba34aSJunchao Zhang So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 751076ba34aSJunchao Zhang calling new Mat_SeqAIJKokkos(). 752076ba34aSJunchao Zhang TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 753076ba34aSJunchao Zhang */ 754076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 755076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 756076ba34aSJunchao Zhang 757076ba34aSJunchao Zhang CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 758076ba34aSJunchao Zhang ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 759076ba34aSJunchao Zhang C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 760a3f881fbSStefano Zampini PetscFunctionReturn(0); 761a3f881fbSStefano Zampini } 762a3f881fbSStefano Zampini 763a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 764076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 765a3f881fbSStefano Zampini { 766a3f881fbSStefano Zampini PetscErrorCode ierr; 767076ba34aSJunchao Zhang Mat_Product *product = mat->product; 768a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 769a3f881fbSStefano Zampini 770a3f881fbSStefano Zampini PetscFunctionBegin; 771a3f881fbSStefano Zampini MatCheckProduct(mat,1); 772a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 773a3f881fbSStefano Zampini if (product->type == MATPRODUCT_ABC) { 774a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 775a3f881fbSStefano Zampini } 776a3f881fbSStefano Zampini if (Biskok && Ciskok) { 777a3f881fbSStefano Zampini switch (product->type) { 778a3f881fbSStefano Zampini case MATPRODUCT_AB: 779a3f881fbSStefano Zampini case MATPRODUCT_AtB: 780a3f881fbSStefano Zampini case MATPRODUCT_ABt: 781a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 782a3f881fbSStefano Zampini break; 783a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 784a3f881fbSStefano Zampini case MATPRODUCT_RARt: 785a3f881fbSStefano Zampini case MATPRODUCT_ABC: 786a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 787a3f881fbSStefano Zampini break; 788a3f881fbSStefano Zampini default: 789a3f881fbSStefano Zampini break; 790a3f881fbSStefano Zampini } 791a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 792a3f881fbSStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 793a3f881fbSStefano Zampini } 794a3f881fbSStefano Zampini PetscFunctionReturn(0); 795a3f881fbSStefano Zampini } 796a587d139SMark 797f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 798f0cf5187SStefano Zampini { 799f0cf5187SStefano Zampini PetscErrorCode ierr; 800f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 801f0cf5187SStefano Zampini 802f0cf5187SStefano Zampini PetscFunctionBegin; 803f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 804f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 805076ba34aSJunchao Zhang KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 806076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 807f0cf5187SStefano Zampini ierr = WaitForKokkos();CHKERRQ(ierr); 808076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 809f0cf5187SStefano Zampini PetscFunctionReturn(0); 810f0cf5187SStefano Zampini } 811f0cf5187SStefano Zampini 812a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 813a587d139SMark { 814a587d139SMark PetscErrorCode ierr; 815076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 816a587d139SMark 817a587d139SMark PetscFunctionBegin; 818076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 819076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 820076ba34aSJunchao Zhang KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 821076ba34aSJunchao Zhang ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); /* Cached diagonal values are invalided */ 822076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 823a587d139SMark PetscFunctionReturn(0); 824a587d139SMark } 825a587d139SMark 826db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 827db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 828db78de30SJunchao Zhang { 829db78de30SJunchao Zhang PetscErrorCode ierr; 830db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 831db78de30SJunchao Zhang 832db78de30SJunchao Zhang PetscFunctionBegin; 833db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 834db78de30SJunchao Zhang PetscValidPointer(kv,2); 835db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 836db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 837db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 838076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 839db78de30SJunchao Zhang PetscFunctionReturn(0); 840db78de30SJunchao Zhang } 841db78de30SJunchao Zhang 842db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 843db78de30SJunchao Zhang { 844db78de30SJunchao Zhang PetscFunctionBegin; 845db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 846db78de30SJunchao Zhang PetscValidPointer(kv,2); 847db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 848db78de30SJunchao Zhang PetscFunctionReturn(0); 849db78de30SJunchao Zhang } 850db78de30SJunchao Zhang 851db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv) 852db78de30SJunchao Zhang { 853db78de30SJunchao Zhang PetscErrorCode ierr; 854db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 855db78de30SJunchao Zhang 856db78de30SJunchao Zhang PetscFunctionBegin; 857db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 858db78de30SJunchao Zhang PetscValidPointer(kv,2); 859db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 860db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 861db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 862076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 863db78de30SJunchao Zhang PetscFunctionReturn(0); 864db78de30SJunchao Zhang } 865db78de30SJunchao Zhang 866db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv) 867db78de30SJunchao Zhang { 868db78de30SJunchao Zhang PetscErrorCode ierr; 869db78de30SJunchao Zhang 870db78de30SJunchao Zhang PetscFunctionBegin; 871db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 872db78de30SJunchao Zhang PetscValidPointer(kv,2); 873db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 874076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 875db78de30SJunchao Zhang PetscFunctionReturn(0); 876db78de30SJunchao Zhang } 877db78de30SJunchao Zhang 878db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 879db78de30SJunchao Zhang { 880db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 881db78de30SJunchao Zhang 882db78de30SJunchao Zhang PetscFunctionBegin; 883db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 884db78de30SJunchao Zhang PetscValidPointer(kv,2); 885db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 886db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 887076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 888db78de30SJunchao Zhang PetscFunctionReturn(0); 889db78de30SJunchao Zhang } 890db78de30SJunchao Zhang 891db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 892db78de30SJunchao Zhang { 893db78de30SJunchao Zhang PetscErrorCode ierr; 894db78de30SJunchao Zhang 895db78de30SJunchao Zhang PetscFunctionBegin; 896db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 897db78de30SJunchao Zhang PetscValidPointer(kv,2); 898db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 899076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 900db78de30SJunchao Zhang PetscFunctionReturn(0); 901db78de30SJunchao Zhang } 902db78de30SJunchao Zhang 903*c17cf699SJunchao Zhang /* Computes Y += alpha X */ 904*c17cf699SJunchao Zhang static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern) 905a587d139SMark { 906a587d139SMark PetscErrorCode ierr; 907a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 908*c17cf699SJunchao Zhang Mat_SeqAIJKokkos *xkok,*ykok,*zkok; 909*c17cf699SJunchao Zhang ConstMatScalarKokkosView Xa; 910*c17cf699SJunchao Zhang MatScalarKokkosView Ya; 911a587d139SMark 912a587d139SMark PetscFunctionBegin; 913*c17cf699SJunchao Zhang PetscCheckTypeName(Y,MATSEQAIJKOKKOS); 914*c17cf699SJunchao Zhang PetscCheckTypeName(X,MATSEQAIJKOKKOS); 915*c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(Y);CHKERRQ(ierr); 916*c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(X);CHKERRQ(ierr); 917db78de30SJunchao Zhang 918*c17cf699SJunchao Zhang if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) { 919*c17cf699SJunchao Zhang /* We could compare on device, but have to get the comparison result on host. So compare on host instead. */ 920a587d139SMark PetscBool e; 921a587d139SMark ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 922a587d139SMark if (e) { 923a587d139SMark ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 924*c17cf699SJunchao Zhang if (e) pattern = SAME_NONZERO_PATTERN; 925a587d139SMark } 926a587d139SMark } 927db78de30SJunchao Zhang 928*c17cf699SJunchao Zhang /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip 929*c17cf699SJunchao Zhang cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2(). 930*c17cf699SJunchao Zhang If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However, 931*c17cf699SJunchao Zhang KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not. 932*c17cf699SJunchao Zhang */ 933*c17cf699SJunchao Zhang ykok = static_cast<Mat_SeqAIJKokkos*>(Y->spptr); 934*c17cf699SJunchao Zhang xkok = static_cast<Mat_SeqAIJKokkos*>(X->spptr); 935*c17cf699SJunchao Zhang Xa = xkok->a_dual.view_device(); 936*c17cf699SJunchao Zhang Ya = ykok->a_dual.view_device(); 937*c17cf699SJunchao Zhang 938*c17cf699SJunchao Zhang if (pattern == SAME_NONZERO_PATTERN) { 939*c17cf699SJunchao Zhang KokkosBlas::axpy(alpha,Xa,Ya); 940*c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(Y); 941*c17cf699SJunchao Zhang } else if (pattern == SUBSET_NONZERO_PATTERN) { 942*c17cf699SJunchao Zhang MatRowMapKokkosView Xi = xkok->i_dual.view_device(),Yi = ykok->i_dual.view_device(); 943*c17cf699SJunchao Zhang MatColIdxKokkosView Xj = xkok->j_dual.view_device(),Yj = ykok->j_dual.view_device(); 944*c17cf699SJunchao Zhang 945*c17cf699SJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(Y->rmap->n, 1),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 946*c17cf699SJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 947*c17cf699SJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=] () { /* Only one thread works in a team */ 948*c17cf699SJunchao Zhang PetscInt p,q = Yi(i); 949*c17cf699SJunchao Zhang for (p=Xi(i); p<Xi(i+1); p++) { /* For each nonzero on row i of X */ 950*c17cf699SJunchao Zhang while (Xj(p) != Yj(q) && q < Yi(i+1)) q++; /* find the matching nonzero on row i of Y */ 951*c17cf699SJunchao Zhang if (Xj(p) == Yj(q)) { /* Found it */ 952*c17cf699SJunchao Zhang Ya(q) += alpha * Xa(p); 953*c17cf699SJunchao Zhang q++; 954a587d139SMark } else { 955*c17cf699SJunchao Zhang /* If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y). 956*c17cf699SJunchao Zhang Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong. 957*c17cf699SJunchao Zhang */ 958*c17cf699SJunchao Zhang if (Yi(i) != Yi(i+1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1"); /* auto promote the double NaN if needed */ 959a587d139SMark } 960*c17cf699SJunchao Zhang } 961*c17cf699SJunchao Zhang }); 962*c17cf699SJunchao Zhang }); 963*c17cf699SJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(Y); 964*c17cf699SJunchao Zhang } else { /* different nonzero patterns */ 965*c17cf699SJunchao Zhang Mat Z; 966*c17cf699SJunchao Zhang KokkosCsrMatrix zcsr; 967*c17cf699SJunchao Zhang KernelHandle kh; 968*c17cf699SJunchao Zhang kh.create_spadd_handle(false); 969*c17cf699SJunchao Zhang KokkosSparse::spadd_symbolic(&kh,xkok->csrmat,ykok->csrmat,zcsr); 970*c17cf699SJunchao Zhang KokkosSparse::spadd_numeric(&kh,alpha,xkok->csrmat,(PetscScalar)1.0,ykok->csrmat,zcsr); 971*c17cf699SJunchao Zhang zkok = new Mat_SeqAIJKokkos(zcsr); 972*c17cf699SJunchao Zhang ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,zkok,&Z);CHKERRQ(ierr); 973*c17cf699SJunchao Zhang ierr = MatHeaderReplace(Y,&Z);CHKERRQ(ierr); 974*c17cf699SJunchao Zhang kh.destroy_spadd_handle(); 975*c17cf699SJunchao Zhang } 976*c17cf699SJunchao Zhang 977a587d139SMark PetscFunctionReturn(0); 978a587d139SMark } 979a587d139SMark 98086a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 9818f7e8f9dSMark Adams { 9828f7e8f9dSMark Adams PetscErrorCode ierr; 9838f7e8f9dSMark Adams 9848f7e8f9dSMark Adams PetscFunctionBegin; 9858f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 9868f7e8f9dSMark Adams ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 9878f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 9888f7e8f9dSMark Adams PetscFunctionReturn(0); 9898f7e8f9dSMark Adams } 9908f7e8f9dSMark Adams 9918c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 9928c3ff71bSJunchao Zhang { 993076ba34aSJunchao Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 994076ba34aSJunchao Zhang 9958c3ff71bSJunchao Zhang PetscFunctionBegin; 996076ba34aSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 9976f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 9986f3d89d0SStefano Zampini 9998c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 10008c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 10018c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 1002a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 1003f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 1004a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 1005076ba34aSJunchao Zhang A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 10068c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 10078c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 10088c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 10098c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 10108c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 10118c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 1012076ba34aSJunchao Zhang A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 1013152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 1014076ba34aSJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 1015076ba34aSJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 1016076ba34aSJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 1017076ba34aSJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 1018076ba34aSJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 1019076ba34aSJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 1020076ba34aSJunchao Zhang PetscFunctionReturn(0); 1021076ba34aSJunchao Zhang } 1022076ba34aSJunchao Zhang 1023076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 1024076ba34aSJunchao Zhang { 1025076ba34aSJunchao Zhang PetscErrorCode ierr; 1026076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 1027076ba34aSJunchao Zhang PetscInt i,m,n; 1028076ba34aSJunchao Zhang 1029076ba34aSJunchao Zhang PetscFunctionBegin; 1030076ba34aSJunchao Zhang if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 1031076ba34aSJunchao Zhang 1032076ba34aSJunchao Zhang m = akok->nrows(); 1033076ba34aSJunchao Zhang n = akok->ncols(); 1034076ba34aSJunchao Zhang ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 1035076ba34aSJunchao Zhang ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1036076ba34aSJunchao Zhang 1037076ba34aSJunchao Zhang /* Set up data structures of A as a MATSEQAIJ */ 1038076ba34aSJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1039076ba34aSJunchao Zhang aseq = (Mat_SeqAIJ*)(A)->data; 1040076ba34aSJunchao Zhang 1041076ba34aSJunchao Zhang akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1042076ba34aSJunchao Zhang akok->j_dual.sync_host(); 1043076ba34aSJunchao Zhang 1044076ba34aSJunchao Zhang aseq->i = akok->i_host_data(); 1045076ba34aSJunchao Zhang aseq->j = akok->j_host_data(); 1046076ba34aSJunchao Zhang aseq->a = akok->a_host_data(); 1047076ba34aSJunchao Zhang aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1048076ba34aSJunchao Zhang aseq->singlemalloc = PETSC_FALSE; 1049076ba34aSJunchao Zhang aseq->free_a = PETSC_FALSE; 1050076ba34aSJunchao Zhang aseq->free_ij = PETSC_FALSE; 1051076ba34aSJunchao Zhang aseq->nz = akok->nnz(); 1052076ba34aSJunchao Zhang aseq->maxnz = aseq->nz; 1053076ba34aSJunchao Zhang 1054076ba34aSJunchao Zhang ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1055076ba34aSJunchao Zhang ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1056076ba34aSJunchao Zhang for (i=0; i<m; i++) { 1057076ba34aSJunchao Zhang aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1058076ba34aSJunchao Zhang } 1059076ba34aSJunchao Zhang 1060076ba34aSJunchao Zhang /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1061076ba34aSJunchao Zhang akok->nonzerostate = A->nonzerostate; 1062076ba34aSJunchao Zhang ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1063076ba34aSJunchao Zhang ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1064076ba34aSJunchao Zhang A->spptr = akok; 1065076ba34aSJunchao Zhang PetscFunctionReturn(0); 1066076ba34aSJunchao Zhang } 1067076ba34aSJunchao Zhang 1068076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1069076ba34aSJunchao Zhang 1070076ba34aSJunchao Zhang Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1071076ba34aSJunchao Zhang */ 1072076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1073076ba34aSJunchao Zhang { 1074076ba34aSJunchao Zhang PetscErrorCode ierr; 1075076ba34aSJunchao Zhang 1076076ba34aSJunchao Zhang PetscFunctionBegin; 1077076ba34aSJunchao Zhang ierr = MatCreate(comm,A);CHKERRQ(ierr); 1078076ba34aSJunchao Zhang ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 10798c3ff71bSJunchao Zhang PetscFunctionReturn(0); 10808c3ff71bSJunchao Zhang } 10818c3ff71bSJunchao Zhang 10828c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/ 1083152b3e56SJunchao Zhang /*@C 10848c3ff71bSJunchao Zhang MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 10858c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 10868c3ff71bSJunchao Zhang Kokkos for calculations. For good matrix 10878c3ff71bSJunchao Zhang assembly performance the user should preallocate the matrix storage by setting 10888c3ff71bSJunchao Zhang the parameter nz (or the array nnz). By setting these parameters accurately, 10898c3ff71bSJunchao Zhang performance during matrix assembly can be increased by more than a factor of 50. 10908c3ff71bSJunchao Zhang 10918c3ff71bSJunchao Zhang Collective 10928c3ff71bSJunchao Zhang 10938c3ff71bSJunchao Zhang Input Parameters: 10948c3ff71bSJunchao Zhang + comm - MPI communicator, set to PETSC_COMM_SELF 10958c3ff71bSJunchao Zhang . m - number of rows 10968c3ff71bSJunchao Zhang . n - number of columns 10978c3ff71bSJunchao Zhang . nz - number of nonzeros per row (same for all rows) 10988c3ff71bSJunchao Zhang - nnz - array containing the number of nonzeros in the various rows 10998c3ff71bSJunchao Zhang (possibly different for each row) or NULL 11008c3ff71bSJunchao Zhang 11018c3ff71bSJunchao Zhang Output Parameter: 11028c3ff71bSJunchao Zhang . A - the matrix 11038c3ff71bSJunchao Zhang 11048c3ff71bSJunchao Zhang It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 11058c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 11068c3ff71bSJunchao Zhang [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 11078c3ff71bSJunchao Zhang 11088c3ff71bSJunchao Zhang Notes: 11098c3ff71bSJunchao Zhang If nnz is given then nz is ignored 11108c3ff71bSJunchao Zhang 11118c3ff71bSJunchao Zhang The AIJ format (also called the Yale sparse matrix format or 11128c3ff71bSJunchao Zhang compressed row storage), is fully compatible with standard Fortran 77 11138c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 11148c3ff71bSJunchao Zhang either one (as in Fortran) or zero. See the users' manual for details. 11158c3ff71bSJunchao Zhang 11168c3ff71bSJunchao Zhang Specify the preallocated storage with either nz or nnz (not both). 11178c3ff71bSJunchao Zhang Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 11188c3ff71bSJunchao Zhang allocation. For large problems you MUST preallocate memory or you 11198c3ff71bSJunchao Zhang will get TERRIBLE performance, see the users' manual chapter on matrices. 11208c3ff71bSJunchao Zhang 11218c3ff71bSJunchao Zhang By default, this format uses inodes (identical nodes) when possible, to 11228c3ff71bSJunchao Zhang improve numerical efficiency of matrix-vector products and solves. We 11238c3ff71bSJunchao Zhang search for consecutive rows with the same nonzero structure, thereby 11248c3ff71bSJunchao Zhang reusing matrix information to achieve increased efficiency. 11258c3ff71bSJunchao Zhang 11268c3ff71bSJunchao Zhang Level: intermediate 11278c3ff71bSJunchao Zhang 1128076ba34aSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 11298c3ff71bSJunchao Zhang @*/ 11308c3ff71bSJunchao Zhang PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 11318c3ff71bSJunchao Zhang { 11328c3ff71bSJunchao Zhang PetscErrorCode ierr; 11338c3ff71bSJunchao Zhang 11348c3ff71bSJunchao Zhang PetscFunctionBegin; 11358c3ff71bSJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 11368c3ff71bSJunchao Zhang ierr = MatCreate(comm,A);CHKERRQ(ierr); 11378c3ff71bSJunchao Zhang ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 11388c3ff71bSJunchao Zhang ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 11398c3ff71bSJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 11408c3ff71bSJunchao Zhang PetscFunctionReturn(0); 11418c3ff71bSJunchao Zhang } 1142930e68a5SMark Adams 11438f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member; 11448f7e8f9dSMark Adams // 11458f7e8f9dSMark Adams // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container. 11468f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 11478f7e8f9dSMark Adams // 11488f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1149930e68a5SMark Adams { 11508f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 11518f7e8f9dSMark Adams Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 11528f7e8f9dSMark Adams IS isrow = b->row,isicol = b->icol; 1153930e68a5SMark Adams PetscErrorCode ierr; 11548f7e8f9dSMark Adams const PetscInt *r_h,*ic_h; 1155076ba34aSJunchao Zhang const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data(); 1156076ba34aSJunchao Zhang const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1157076ba34aSJunchao Zhang PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 11588f7e8f9dSMark Adams PetscBool row_identity,col_identity; 11598f7e8f9dSMark Adams PetscInt nc, Nf, nVec=32; // should be a parameter 11608f7e8f9dSMark Adams PetscContainer container; 1161930e68a5SMark Adams 1162930e68a5SMark Adams PetscFunctionBegin; 11638f7e8f9dSMark Adams if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n); 11648f7e8f9dSMark Adams ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 11658f7e8f9dSMark Adams if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 11668f7e8f9dSMark Adams ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); 11678f7e8f9dSMark Adams if (container) { 11688f7e8f9dSMark Adams PetscInt *pNf=NULL, nv; 11698f7e8f9dSMark Adams ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); 11708f7e8f9dSMark Adams Nf = (*pNf)%1000; 11718f7e8f9dSMark Adams nv = (*pNf)/1000; 11728f7e8f9dSMark Adams if (nv>0) nVec = nv; 11738f7e8f9dSMark Adams } else Nf = 1; 11748f7e8f9dSMark Adams if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf); 11758f7e8f9dSMark Adams ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 11768f7e8f9dSMark Adams ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 11778f7e8f9dSMark Adams ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 11788f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 11798f7e8f9dSMark Adams ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 11808f7e8f9dSMark Adams #endif 11818f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 11828f7e8f9dSMark Adams { 11838f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1 11848f7e8f9dSMark Adams using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 11858f7e8f9dSMark Adams using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 11868f7e8f9dSMark Adams using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 11878f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 11888f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 11898f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 11908f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 11918f7e8f9dSMark Adams size_t flops_h = 0.0; 11928f7e8f9dSMark Adams Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 11938f7e8f9dSMark Adams Kokkos::View<size_t> d_flops_k ("flops"); 11948f7e8f9dSMark Adams const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 11958f7e8f9dSMark Adams const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 11968f7e8f9dSMark Adams Kokkos::deep_copy (d_flops_k, h_flops_k); 11978f7e8f9dSMark Adams Kokkos::deep_copy (d_r_k, h_r_k); 11988f7e8f9dSMark Adams Kokkos::deep_copy (d_ic_k, h_ic_k); 11998f7e8f9dSMark Adams // Fill A --> fact 12008f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1201042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 12028f7e8f9dSMark Adams const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 12038f7e8f9dSMark Adams const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 12048f7e8f9dSMark Adams // zero rows of B 12058f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 12068f7e8f9dSMark Adams PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 12078f7e8f9dSMark Adams PetscScalar *baL = ba_d + bi_d[rowb]; 12088f7e8f9dSMark Adams PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 12098f7e8f9dSMark Adams /* zero (unfactored row) */ 12108f7e8f9dSMark Adams for (int j=0;j<nzbL;j++) baL[j] = 0; 12118f7e8f9dSMark Adams for (int j=0;j<nzbU;j++) baU[j] = 0; 12128f7e8f9dSMark Adams }); 12138f7e8f9dSMark Adams // copy A into B 12148f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 12158f7e8f9dSMark Adams PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 12168f7e8f9dSMark Adams const PetscScalar *av = aa_d + ai_d[rowa]; 12178f7e8f9dSMark Adams const PetscInt *ajtmp = aj_d + ai_d[rowa]; 12188f7e8f9dSMark Adams /* load in initial (unfactored row) */ 12198f7e8f9dSMark Adams for (int j=0;j<nza;j++) { 12208f7e8f9dSMark Adams PetscInt colb = ic[ajtmp[j]]; 12218f7e8f9dSMark Adams PetscScalar vala = av[j]; 12228f7e8f9dSMark Adams if (colb == rowb) { 12238f7e8f9dSMark Adams *(ba_d + bdiag_d[rowb]) = vala; 12248f7e8f9dSMark Adams } else { 12258f7e8f9dSMark Adams const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 12268f7e8f9dSMark Adams PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 12278f7e8f9dSMark Adams PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 12288f7e8f9dSMark Adams for (int j=0; j<nz ; j++) { 12298f7e8f9dSMark Adams if (pbj[j] == colb) { 12308f7e8f9dSMark Adams pba[j] = vala; 12318f7e8f9dSMark Adams set++; 12328f7e8f9dSMark Adams break; 12338f7e8f9dSMark Adams } 12348f7e8f9dSMark Adams } 12358f7e8f9dSMark Adams if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 12368f7e8f9dSMark Adams } 12378f7e8f9dSMark Adams } 12388f7e8f9dSMark Adams }); 12398f7e8f9dSMark Adams }); 12408f7e8f9dSMark Adams Kokkos::fence(); 1241930e68a5SMark Adams 12428f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 12438f7e8f9dSMark Adams sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 12448f7e8f9dSMark Adams scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 12458f7e8f9dSMark Adams sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1246042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 12478f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc; 12488f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 12498f7e8f9dSMark Adams // A22 panel update for each row A(1,:) and col A(:,1) 12508f7e8f9dSMark Adams for (int ii=start; ii<end-1; ii++) { 12518f7e8f9dSMark Adams const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 12528f7e8f9dSMark Adams const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 12538f7e8f9dSMark Adams const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 12548f7e8f9dSMark Adams const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 12558f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 12568f7e8f9dSMark Adams PetscInt kIdx = j*Ni + field_block_idx; 12578f7e8f9dSMark Adams if (kIdx >= nzUi) /* void */ ; 12588f7e8f9dSMark Adams else { 12598f7e8f9dSMark Adams const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 12608f7e8f9dSMark Adams const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 12618f7e8f9dSMark Adams const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 12628f7e8f9dSMark Adams size_t st_idx; 12638f7e8f9dSMark Adams // find and do L(k,i) = A(:k,i) / A(i,i) 12648f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 12658f7e8f9dSMark Adams // get column, there has got to be a better way 12668f7e8f9dSMark Adams Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 12678f7e8f9dSMark Adams //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii); 12688f7e8f9dSMark Adams if (pjL[j] == ii) { 12698f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + j; 12708f7e8f9dSMark Adams idx = j; // output 12718f7e8f9dSMark Adams *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 12728f7e8f9dSMark Adams } 12738f7e8f9dSMark Adams }, st_idx); 12748f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 12758f7e8f9dSMark Adams if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii); 12768f7e8f9dSMark Adams else { // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 12778f7e8f9dSMark Adams // U(i+1,:end) 12788f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 12798f7e8f9dSMark Adams PetscScalar Uij = baUi[uiIdx]; 12808f7e8f9dSMark Adams PetscInt col = bjUi[uiIdx]; 12818f7e8f9dSMark Adams if (col==myk) { 12828f7e8f9dSMark Adams // A_kk = A_kk - L_ki * U_ij(k) 12838f7e8f9dSMark Adams PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 12848f7e8f9dSMark Adams *Akkv = *Akkv - L_ki() * Uij; // UiK 12858f7e8f9dSMark Adams } else { 12868f7e8f9dSMark Adams PetscScalar *start, *end, *pAkjv=NULL; 12878f7e8f9dSMark Adams PetscInt high, low; 12888f7e8f9dSMark Adams const PetscInt *startj; 12898f7e8f9dSMark Adams if (col<myk) { // L 12908f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 12918f7e8f9dSMark Adams PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 12928f7e8f9dSMark Adams start = pLki+1; // start at pLki+1, A22(myk,1) 12938f7e8f9dSMark Adams startj= bj_d + bi_d[myk] + idx; 12948f7e8f9dSMark Adams end = ba_d + bi_d[myk+1]; 12958f7e8f9dSMark Adams } else { 12968f7e8f9dSMark Adams PetscInt idx = bdiag_d[myk+1]+1; 12978f7e8f9dSMark Adams start = ba_d + idx; 12988f7e8f9dSMark Adams startj= bj_d + idx; 12998f7e8f9dSMark Adams end = ba_d + bdiag_d[myk]; 13008f7e8f9dSMark Adams } 13018f7e8f9dSMark Adams // search for 'col', use bisection search - TODO 13028f7e8f9dSMark Adams low = 0; 13038f7e8f9dSMark Adams high = (PetscInt)(end-start); 13048f7e8f9dSMark Adams while (high-low > 5) { 13058f7e8f9dSMark Adams int t = (low+high)/2; 13068f7e8f9dSMark Adams if (startj[t] > col) high = t; 13078f7e8f9dSMark Adams else low = t; 13088f7e8f9dSMark Adams } 13098f7e8f9dSMark Adams for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 13108f7e8f9dSMark Adams if (startj[pAkjv-start] == col) break; 13118f7e8f9dSMark Adams } 13128f7e8f9dSMark Adams if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col); 13138f7e8f9dSMark Adams *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 13148f7e8f9dSMark Adams } 13158f7e8f9dSMark Adams }); 13168f7e8f9dSMark Adams } 13178f7e8f9dSMark Adams } 13188f7e8f9dSMark Adams }); 13198f7e8f9dSMark Adams team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 13208f7e8f9dSMark Adams if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 13218f7e8f9dSMark Adams } /* endof for (i=0; i<n; i++) { */ 13228f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 13238f7e8f9dSMark Adams }); 13248f7e8f9dSMark Adams Kokkos::fence(); 13258f7e8f9dSMark Adams Kokkos::deep_copy (h_flops_k, d_flops_k); 13268f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 13278f7e8f9dSMark Adams ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 13288f7e8f9dSMark Adams #elif defined(PETSC_USE_LOG) 13298f7e8f9dSMark Adams ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 13308f7e8f9dSMark Adams #endif 13318f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 13328f7e8f9dSMark Adams const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 13338f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 13348f7e8f9dSMark Adams /* Invert diagonal for simpler triangular solves */ 13358f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 13368f7e8f9dSMark Adams int i = start + outer_index*Ni + lg_rank%Ni; 13378f7e8f9dSMark Adams if (i < end) { 13388f7e8f9dSMark Adams PetscScalar *pv = ba_d + bdiag_d[i]; 13398f7e8f9dSMark Adams *pv = 1.0/(*pv); 13408f7e8f9dSMark Adams } 13418f7e8f9dSMark Adams }); 13428f7e8f9dSMark Adams }); 13438f7e8f9dSMark Adams } 13448f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 13458f7e8f9dSMark Adams ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 13468f7e8f9dSMark Adams #endif 13478f7e8f9dSMark Adams ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 13488f7e8f9dSMark Adams ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 13498f7e8f9dSMark Adams 13508f7e8f9dSMark Adams ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 13518f7e8f9dSMark Adams ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 13528f7e8f9dSMark Adams if (b->inode.size) { 13538f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_Inode; 13548f7e8f9dSMark Adams } else if (row_identity && col_identity) { 13558f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 13568f7e8f9dSMark Adams } else { 13578f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 13588f7e8f9dSMark Adams } 13598f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_GPU; 13608f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 13618f7e8f9dSMark Adams B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 13628f7e8f9dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 13638f7e8f9dSMark Adams B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 13648f7e8f9dSMark Adams B->ops->matsolve = MatMatSolve_SeqAIJ; 13658f7e8f9dSMark Adams B->assembled = PETSC_TRUE; 13668f7e8f9dSMark Adams B->preallocated = PETSC_TRUE; 13678f7e8f9dSMark Adams 1368930e68a5SMark Adams PetscFunctionReturn(0); 1369930e68a5SMark Adams } 1370930e68a5SMark Adams 137186a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1372930e68a5SMark Adams { 1373930e68a5SMark Adams PetscErrorCode ierr; 1374930e68a5SMark Adams 1375930e68a5SMark Adams PetscFunctionBegin; 1376930e68a5SMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 137786a27549SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 137886a27549SJunchao Zhang PetscFunctionReturn(0); 137986a27549SJunchao Zhang } 138086a27549SJunchao Zhang 138186a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 138286a27549SJunchao Zhang { 138386a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 138486a27549SJunchao Zhang 138586a27549SJunchao Zhang PetscFunctionBegin; 138686a27549SJunchao Zhang if (!factors->sptrsv_symbolic_completed) { 138786a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 138886a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 138986a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_TRUE; 139086a27549SJunchao Zhang } 139186a27549SJunchao Zhang PetscFunctionReturn(0); 139286a27549SJunchao Zhang } 139386a27549SJunchao Zhang 139486a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */ 139586a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 139686a27549SJunchao Zhang { 139786a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1398076ba34aSJunchao Zhang MatColIdxType n = A->rmap->n; 139986a27549SJunchao Zhang 140086a27549SJunchao Zhang PetscFunctionBegin; 140186a27549SJunchao Zhang if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 140286a27549SJunchao Zhang /* Update L^T and do sptrsv symbolic */ 1403076ba34aSJunchao Zhang factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 140486a27549SJunchao Zhang Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1405076ba34aSJunchao Zhang factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1406076ba34aSJunchao Zhang factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 140786a27549SJunchao Zhang 140886a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1409076ba34aSJunchao Zhang ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1410076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1411076ba34aSJunchao Zhang MatRowMapKokkosView,DefaultExecutionSpace>( 141286a27549SJunchao Zhang n,n,factors->iL_d,factors->jL_d,factors->aL_d, 141386a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 141486a27549SJunchao Zhang 141586a27549SJunchao Zhang /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 141686a27549SJunchao Zhang We have to sort the indices, until KK provides finer control options. 141786a27549SJunchao Zhang */ 1418076ba34aSJunchao Zhang KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1419076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 142086a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 142186a27549SJunchao Zhang 142286a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 142386a27549SJunchao Zhang 142486a27549SJunchao Zhang /* Update U^T and do sptrsv symbolic */ 1425076ba34aSJunchao Zhang factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 142686a27549SJunchao Zhang Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1427076ba34aSJunchao Zhang factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1428076ba34aSJunchao Zhang factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 142986a27549SJunchao Zhang 143086a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1431076ba34aSJunchao Zhang ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1432076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1433076ba34aSJunchao Zhang MatRowMapKokkosView,DefaultExecutionSpace>( 143486a27549SJunchao Zhang n,n,factors->iU_d, factors->jU_d, factors->aU_d, 143586a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 143686a27549SJunchao Zhang 143786a27549SJunchao Zhang /* Sort indices. See comments above */ 1438076ba34aSJunchao Zhang KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1439076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 144086a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 144186a27549SJunchao Zhang 144286a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 144386a27549SJunchao Zhang factors->transpose_updated = PETSC_TRUE; 144486a27549SJunchao Zhang } 144586a27549SJunchao Zhang PetscFunctionReturn(0); 144686a27549SJunchao Zhang } 144786a27549SJunchao Zhang 144886a27549SJunchao Zhang /* Solve Ax = b, with A = LU */ 144986a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 145086a27549SJunchao Zhang { 145186a27549SJunchao Zhang PetscErrorCode ierr; 145286a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 145386a27549SJunchao Zhang PetscScalarKokkosView xv; 145486a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 145586a27549SJunchao Zhang 145686a27549SJunchao Zhang PetscFunctionBegin; 145786a27549SJunchao Zhang ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 145886a27549SJunchao Zhang ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 145986a27549SJunchao Zhang ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 146086a27549SJunchao Zhang /* Solve L tmpv = b */ 14613f520e80SJunchao Zhang CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 146286a27549SJunchao Zhang /* Solve Ux = tmpv */ 14633f520e80SJunchao Zhang CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 146486a27549SJunchao Zhang ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 146586a27549SJunchao Zhang ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 146686a27549SJunchao Zhang PetscFunctionReturn(0); 146786a27549SJunchao Zhang } 146886a27549SJunchao Zhang 1469076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */ 147086a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 147186a27549SJunchao Zhang { 147286a27549SJunchao Zhang PetscErrorCode ierr; 147386a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 147486a27549SJunchao Zhang PetscScalarKokkosView xv; 147586a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 147686a27549SJunchao Zhang 147786a27549SJunchao Zhang PetscFunctionBegin; 147886a27549SJunchao Zhang ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 147986a27549SJunchao Zhang ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 148086a27549SJunchao Zhang ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 148186a27549SJunchao Zhang /* Solve U^T tmpv = b */ 148286a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 148386a27549SJunchao Zhang 148486a27549SJunchao Zhang /* Solve L^T x = tmpv */ 148586a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 148686a27549SJunchao Zhang ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 148786a27549SJunchao Zhang ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 148886a27549SJunchao Zhang PetscFunctionReturn(0); 148986a27549SJunchao Zhang } 149086a27549SJunchao Zhang 149186a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 149286a27549SJunchao Zhang { 149386a27549SJunchao Zhang PetscErrorCode ierr; 149486a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 149586a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 149686a27549SJunchao Zhang PetscInt fill_lev = info->levels; 149786a27549SJunchao Zhang 149886a27549SJunchao Zhang PetscFunctionBegin; 149986a27549SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1500076ba34aSJunchao Zhang 1501076ba34aSJunchao Zhang auto a_d = aijkok->a_dual.view_device(); 1502076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1503076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1504076ba34aSJunchao Zhang 1505076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 150686a27549SJunchao Zhang 150786a27549SJunchao Zhang B->assembled = PETSC_TRUE; 150886a27549SJunchao Zhang B->preallocated = PETSC_TRUE; 150986a27549SJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos; 151086a27549SJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 151186a27549SJunchao Zhang B->ops->matsolve = NULL; 151286a27549SJunchao Zhang B->ops->matsolvetranspose = NULL; 151386a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 151486a27549SJunchao Zhang 151586a27549SJunchao Zhang /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 151686a27549SJunchao Zhang factors->transpose_updated = PETSC_FALSE; 151786a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 151886a27549SJunchao Zhang PetscFunctionReturn(0); 151986a27549SJunchao Zhang } 152086a27549SJunchao Zhang 152186a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 152286a27549SJunchao Zhang { 152386a27549SJunchao Zhang PetscErrorCode ierr; 152486a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 152586a27549SJunchao Zhang Mat_SeqAIJ *b; 152686a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 152786a27549SJunchao Zhang PetscInt fill_lev = info->levels; 152886a27549SJunchao Zhang PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 152986a27549SJunchao Zhang PetscInt n = A->rmap->n; 153086a27549SJunchao Zhang 153186a27549SJunchao Zhang PetscFunctionBegin; 153286a27549SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 153386a27549SJunchao Zhang /* Rebuild factors */ 153486a27549SJunchao Zhang if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 153586a27549SJunchao Zhang else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 153686a27549SJunchao Zhang 153786a27549SJunchao Zhang /* Create a spiluk handle and then do symbolic factorization */ 153886a27549SJunchao Zhang nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 153986a27549SJunchao Zhang factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 154086a27549SJunchao Zhang 154186a27549SJunchao Zhang auto spiluk_handle = factors->kh.get_spiluk_handle(); 154286a27549SJunchao Zhang 154386a27549SJunchao Zhang Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 154486a27549SJunchao Zhang Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 154586a27549SJunchao Zhang Kokkos::realloc(factors->iU_d,n+1); 154686a27549SJunchao Zhang Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 154786a27549SJunchao Zhang 154886a27549SJunchao Zhang aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1549076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1550076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1551076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 155286a27549SJunchao Zhang /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 155386a27549SJunchao Zhang 155486a27549SJunchao Zhang Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 155586a27549SJunchao Zhang Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 155686a27549SJunchao Zhang Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 155786a27549SJunchao Zhang Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 155886a27549SJunchao Zhang 155986a27549SJunchao Zhang /* TODO: add options to select sptrsv algorithms */ 156086a27549SJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 156186a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 156286a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 156386a27549SJunchao Zhang #else 156486a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 156586a27549SJunchao Zhang #endif 156686a27549SJunchao Zhang 156786a27549SJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 156886a27549SJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 156986a27549SJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 157086a27549SJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 157186a27549SJunchao Zhang 157286a27549SJunchao Zhang /* Fill fields of the factor matrix B */ 157386a27549SJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 157486a27549SJunchao Zhang b = (Mat_SeqAIJ*)B->data; 157586a27549SJunchao Zhang b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 157686a27549SJunchao Zhang B->info.fill_ratio_given = info->fill; 157786a27549SJunchao Zhang B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 157886a27549SJunchao Zhang 157986a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 158086a27549SJunchao Zhang B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1581930e68a5SMark Adams PetscFunctionReturn(0); 1582930e68a5SMark Adams } 1583930e68a5SMark Adams 15848f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 15858f7e8f9dSMark Adams { 15868f7e8f9dSMark Adams PetscErrorCode ierr; 15878f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 15888f7e8f9dSMark Adams const PetscInt nrows = A->rmap->n; 1589930e68a5SMark Adams 15908f7e8f9dSMark Adams PetscFunctionBegin; 15918f7e8f9dSMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 15928f7e8f9dSMark Adams B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 15938f7e8f9dSMark Adams // move B data into Kokkos 15948f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 15958f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 15968f7e8f9dSMark Adams { 15978f7e8f9dSMark Adams Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 15988f7e8f9dSMark Adams if (!baijkok->diag_d) { 15998f7e8f9dSMark Adams const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1600152b3e56SJunchao Zhang baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 16018f7e8f9dSMark Adams Kokkos::deep_copy (*baijkok->diag_d, h_diag); 16028f7e8f9dSMark Adams } 16038f7e8f9dSMark Adams } 16048f7e8f9dSMark Adams PetscFunctionReturn(0); 16058f7e8f9dSMark Adams } 16068f7e8f9dSMark Adams 160786a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1608930e68a5SMark Adams { 1609930e68a5SMark Adams PetscFunctionBegin; 1610930e68a5SMark Adams *type = MATSOLVERKOKKOS; 1611930e68a5SMark Adams PetscFunctionReturn(0); 1612930e68a5SMark Adams } 1613930e68a5SMark Adams 16148f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 16158f7e8f9dSMark Adams { 16168f7e8f9dSMark Adams PetscFunctionBegin; 16178f7e8f9dSMark Adams *type = MATSOLVERKOKKOSDEVICE; 16188f7e8f9dSMark Adams PetscFunctionReturn(0); 16198f7e8f9dSMark Adams } 16208f7e8f9dSMark Adams 1621930e68a5SMark Adams /*MC 162286a27549SJunchao Zhang MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 162386a27549SJunchao Zhang on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1624930e68a5SMark Adams 1625930e68a5SMark Adams Level: beginner 1626930e68a5SMark Adams 162786a27549SJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1628930e68a5SMark Adams M*/ 162986a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1630930e68a5SMark Adams { 1631930e68a5SMark Adams PetscErrorCode ierr; 1632930e68a5SMark Adams PetscInt n = A->rmap->n; 1633930e68a5SMark Adams 1634930e68a5SMark Adams PetscFunctionBegin; 1635930e68a5SMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1636930e68a5SMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1637930e68a5SMark Adams (*B)->factortype = ftype; 16384ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1639930e68a5SMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1640930e68a5SMark Adams 16418f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 1642930e68a5SMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 164386a27549SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 164486a27549SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 164586a27549SJunchao Zhang } else if (ftype == MAT_FACTOR_ILU) { 164686a27549SJunchao Zhang ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 164786a27549SJunchao Zhang (*B)->canuseordering = PETSC_FALSE; 164886a27549SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 164986a27549SJunchao Zhang } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1650930e68a5SMark Adams 1651930e68a5SMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 165286a27549SJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1653930e68a5SMark Adams PetscFunctionReturn(0); 1654930e68a5SMark Adams } 16558f7e8f9dSMark Adams 16568f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 16578f7e8f9dSMark Adams { 16588f7e8f9dSMark Adams PetscErrorCode ierr; 16598f7e8f9dSMark Adams PetscInt n = A->rmap->n; 16608f7e8f9dSMark Adams 16618f7e8f9dSMark Adams PetscFunctionBegin; 16628f7e8f9dSMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 16638f7e8f9dSMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 16648f7e8f9dSMark Adams (*B)->factortype = ftype; 1665f73b0415SBarry Smith (*B)->canuseordering = PETSC_TRUE; 16664ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 16678f7e8f9dSMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 16688f7e8f9dSMark Adams 16698f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 16708f7e8f9dSMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 16718f7e8f9dSMark Adams (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 16728f7e8f9dSMark Adams } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 16738f7e8f9dSMark Adams 16748f7e8f9dSMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 16758f7e8f9dSMark Adams ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 16768f7e8f9dSMark Adams PetscFunctionReturn(0); 16778f7e8f9dSMark Adams } 167886a27549SJunchao Zhang 167986a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 168086a27549SJunchao Zhang { 168186a27549SJunchao Zhang PetscErrorCode ierr; 168286a27549SJunchao Zhang 168386a27549SJunchao Zhang PetscFunctionBegin; 168486a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 168586a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 168686a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 168786a27549SJunchao Zhang PetscFunctionReturn(0); 168886a27549SJunchao Zhang } 168986a27549SJunchao Zhang 1690076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */ 1691076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1692076ba34aSJunchao Zhang { 1693076ba34aSJunchao Zhang PetscErrorCode ierr; 1694076ba34aSJunchao Zhang const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1695076ba34aSJunchao Zhang const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1696076ba34aSJunchao Zhang const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1697076ba34aSJunchao Zhang const PetscInt *i = iv.data(); 1698076ba34aSJunchao Zhang const PetscInt *j = jv.data(); 1699076ba34aSJunchao Zhang const PetscScalar *a = av.data(); 1700076ba34aSJunchao Zhang PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1701076ba34aSJunchao Zhang 1702076ba34aSJunchao Zhang PetscFunctionBegin; 1703076ba34aSJunchao Zhang ierr = PetscPrintf(PETSC_COMM_SELF,"%D x %D SeqAIJKokkos, with %D nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1704076ba34aSJunchao Zhang for (PetscInt k=0; k<m; k++) { 1705076ba34aSJunchao Zhang ierr = PetscPrintf(PETSC_COMM_SELF,"%D: ",k);CHKERRQ(ierr); 1706076ba34aSJunchao Zhang for (PetscInt p=i[k]; p<i[k+1]; p++) { 1707076ba34aSJunchao Zhang ierr = PetscPrintf(PETSC_COMM_SELF,"%D(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1708076ba34aSJunchao Zhang } 1709076ba34aSJunchao Zhang ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1710076ba34aSJunchao Zhang } 1711076ba34aSJunchao Zhang PetscFunctionReturn(0); 1712076ba34aSJunchao Zhang } 1713