111d22bbfSJunchao Zhang #include <petscvec_kokkos.hpp> 2*076ba34aSJunchao Zhang #include <petscpkg_version.h> 3152b3e56SJunchao Zhang #include <petsc/private/petscimpl.h> 48c3ff71bSJunchao Zhang #include <petscsystypes.h> 58c3ff71bSJunchao Zhang #include <petscerror.h> 68c3ff71bSJunchao Zhang 78c3ff71bSJunchao Zhang #include <Kokkos_Core.hpp> 8f0cf5187SStefano Zampini #include <KokkosBlas.hpp> 9*076ba34aSJunchao Zhang #include <KokkosKernels_Sorting.hpp> 108c3ff71bSJunchao Zhang #include <KokkosSparse_CrsMatrix.hpp> 118c3ff71bSJunchao Zhang #include <KokkosSparse_spmv.hpp> 1286a27549SJunchao Zhang #include <KokkosSparse_spiluk.hpp> 1386a27549SJunchao Zhang #include <KokkosSparse_sptrsv.hpp> 14*076ba34aSJunchao Zhang #include <KokkosSparse_spgemm.hpp> 15*076ba34aSJunchao Zhang #include <KokkosSparse_spadd.hpp> 1686a27549SJunchao Zhang 178c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/aij.h> 188c3ff71bSJunchao Zhang #include <../src/mat/impls/aij/seq/kokkos/aijkokkosimpl.hpp> 198c3ff71bSJunchao Zhang 208c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */ 218c3ff71bSJunchao Zhang 22*076ba34aSJunchao Zhang /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after 23*076ba34aSJunchao Zhang we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult). 24*076ba34aSJunchao Zhang In the latter case, it is important to set a_dual's sync state correctly. 25*076ba34aSJunchao Zhang */ 268c3ff71bSJunchao Zhang static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode) 278c3ff71bSJunchao Zhang { 288c3ff71bSJunchao Zhang PetscErrorCode ierr; 29*076ba34aSJunchao Zhang Mat_SeqAIJ *aijseq; 30*076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 318c3ff71bSJunchao Zhang 328c3ff71bSJunchao Zhang PetscFunctionBegin; 33*076ba34aSJunchao Zhang if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0); 348c3ff71bSJunchao Zhang ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr); 35*076ba34aSJunchao Zhang 36*076ba34aSJunchao Zhang aijseq = static_cast<Mat_SeqAIJ*>(A->data); 37*076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 38*076ba34aSJunchao Zhang 39*076ba34aSJunchao Zhang /* If aijkok does not exist, we just copy i, j to device. 40*076ba34aSJunchao Zhang If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host. 41*076ba34aSJunchao Zhang In both cases, we build a new aijkok structure. 42*076ba34aSJunchao Zhang */ 43*076ba34aSJunchao Zhang if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */ 44*076ba34aSJunchao Zhang delete aijkok; 45*076ba34aSJunchao Zhang aijkok = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aijseq->nz,aijseq->i,aijseq->j,aijseq->a,A->nonzerostate,PETSC_FALSE/*don't copy mat values to device*/); 46*076ba34aSJunchao Zhang A->spptr = aijkok; 47*076ba34aSJunchao Zhang } 48*076ba34aSJunchao Zhang 49a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 50a587d139SMark A->offloadmask = PETSC_OFFLOAD_GPU; // in GPU mode, no going back. MatSetValues checks this 51a587d139SMark } 528c3ff71bSJunchao Zhang PetscFunctionReturn(0); 538c3ff71bSJunchao Zhang } 548c3ff71bSJunchao Zhang 5586a27549SJunchao Zhang /* Sync CSR data to device if not yet */ 56*076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A) 578c3ff71bSJunchao Zhang { 588c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 598c3ff71bSJunchao Zhang 608c3ff71bSJunchao Zhang PetscFunctionBegin; 6186a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from host to device"); 62*076ba34aSJunchao Zhang if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync unassembled matrix from host to device"); 63*076ba34aSJunchao Zhang if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 64*076ba34aSJunchao Zhang if (aijkok->a_dual.need_sync_device()) { 65*076ba34aSJunchao Zhang aijkok->a_dual.sync_device(); 66*076ba34aSJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; /* values of the tranpose is out-of-date */ 6786a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 688c3ff71bSJunchao Zhang } 698c3ff71bSJunchao Zhang PetscFunctionReturn(0); 708c3ff71bSJunchao Zhang } 718c3ff71bSJunchao Zhang 72*076ba34aSJunchao Zhang /* Mark the CSR data on device as modified */ 73*076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A) 7486a27549SJunchao Zhang { 7586a27549SJunchao Zhang PetscErrorCode ierr; 7686a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 7786a27549SJunchao Zhang 7886a27549SJunchao Zhang PetscFunctionBegin; 7986a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Not supported for factorized matries"); 8086a27549SJunchao Zhang aijkok->a_dual.clear_sync_state(); 8186a27549SJunchao Zhang aijkok->a_dual.modify_device(); 8286a27549SJunchao Zhang aijkok->transpose_updated = PETSC_FALSE; 8386a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_FALSE; 8486a27549SJunchao Zhang ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr); 8586a27549SJunchao Zhang PetscFunctionReturn(0); 8686a27549SJunchao Zhang } 8786a27549SJunchao Zhang 88f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A) 89f0cf5187SStefano Zampini { 90f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 91f0cf5187SStefano Zampini 92f0cf5187SStefano Zampini PetscFunctionBegin; 93f0cf5187SStefano Zampini PetscCheckTypeName(A,MATSEQAIJKOKKOS); 9486a27549SJunchao Zhang /* We do not expect one needs factors on host */ 9586a27549SJunchao Zhang if (A->factortype != MAT_FACTOR_NONE) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Cann't sync factorized matrix from device to host"); 96f0cf5187SStefano Zampini if (!aijkok) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"Missing AIJKOK"); 97*076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 98f0cf5187SStefano Zampini PetscFunctionReturn(0); 99f0cf5187SStefano Zampini } 100f0cf5187SStefano Zampini 101f0cf5187SStefano Zampini static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 102f0cf5187SStefano Zampini { 103*076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 104f0cf5187SStefano Zampini 105f0cf5187SStefano Zampini PetscFunctionBegin; 106*076ba34aSJunchao Zhang if (aijkok) { 107*076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 108*076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 109*076ba34aSJunchao Zhang } else { /* Happens when calling MatSetValues on a newly created matrix */ 110*076ba34aSJunchao Zhang *array = static_cast<Mat_SeqAIJ*>(A->data)->a; 111*076ba34aSJunchao Zhang } 112*076ba34aSJunchao Zhang PetscFunctionReturn(0); 113*076ba34aSJunchao Zhang } 114*076ba34aSJunchao Zhang 115*076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar *array[]) 116*076ba34aSJunchao Zhang { 117*076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 118*076ba34aSJunchao Zhang 119*076ba34aSJunchao Zhang PetscFunctionBegin; 120*076ba34aSJunchao Zhang if (aijkok) aijkok->a_dual.modify_host(); 121*076ba34aSJunchao Zhang PetscFunctionReturn(0); 122*076ba34aSJunchao Zhang } 123*076ba34aSJunchao Zhang 124*076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 125*076ba34aSJunchao Zhang { 126*076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 127*076ba34aSJunchao Zhang 128*076ba34aSJunchao Zhang PetscFunctionBegin; 129*076ba34aSJunchao Zhang aijkok->a_dual.sync_host(); 130*076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 131*076ba34aSJunchao Zhang PetscFunctionReturn(0); 132*076ba34aSJunchao Zhang } 133*076ba34aSJunchao Zhang 134*076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar *array[]) 135*076ba34aSJunchao Zhang { 136*076ba34aSJunchao Zhang PetscFunctionBegin; 137*076ba34aSJunchao Zhang *array = NULL; 138*076ba34aSJunchao Zhang PetscFunctionReturn(0); 139*076ba34aSJunchao Zhang } 140*076ba34aSJunchao Zhang 141*076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 142*076ba34aSJunchao Zhang { 143*076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 144*076ba34aSJunchao Zhang 145*076ba34aSJunchao Zhang PetscFunctionBegin; 146*076ba34aSJunchao Zhang *array = aijkok->a_dual.view_host().data(); 147*076ba34aSJunchao Zhang PetscFunctionReturn(0); 148*076ba34aSJunchao Zhang } 149*076ba34aSJunchao Zhang 150*076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar *array[]) 151*076ba34aSJunchao Zhang { 152*076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 153*076ba34aSJunchao Zhang 154*076ba34aSJunchao Zhang PetscFunctionBegin; 155*076ba34aSJunchao Zhang aijkok->a_dual.clear_sync_state(); 156*076ba34aSJunchao Zhang aijkok->a_dual.modify_host(); 157f0cf5187SStefano Zampini PetscFunctionReturn(0); 158f0cf5187SStefano Zampini } 159f0cf5187SStefano Zampini 160a587d139SMark // MatSeqAIJKokkosSetDeviceMat takes a PetscSplitCSRDataStructure with device data and copies it to the device. Note, "deep_copy" here is really a shallow copy 161042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosSetDeviceMat(Mat A, PetscSplitCSRDataStructure h_mat) 162a587d139SMark { 163042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 164042217e8SBarry Smith Kokkos::View<SplitCSRMat, Kokkos::HostSpace> h_mat_k(h_mat); 165a587d139SMark 166a587d139SMark PetscFunctionBegin; 167*076ba34aSJunchao Zhang if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 168152b3e56SJunchao Zhang aijkok->device_mat_d = create_mirror(DefaultMemorySpace(),h_mat_k); 169a587d139SMark Kokkos::deep_copy (aijkok->device_mat_d, h_mat_k); 170a587d139SMark PetscFunctionReturn(0); 171a587d139SMark } 172a587d139SMark 173a587d139SMark // MatSeqAIJKokkosGetDeviceMat gets the device if it is here, otherwise it creates a place for it and returns NULL 174042217e8SBarry Smith PetscErrorCode MatSeqAIJKokkosGetDeviceMat(Mat A, PetscSplitCSRDataStructure *d_mat) 175a587d139SMark { 176042217e8SBarry Smith Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 177a587d139SMark 178a587d139SMark PetscFunctionBegin; 179a587d139SMark if (aijkok && aijkok->device_mat_d.data()) { 180a587d139SMark *d_mat = aijkok->device_mat_d.data(); 181a587d139SMark } else { 182a587d139SMark PetscErrorCode ierr; 183a587d139SMark ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok (we are making d_mat now so make a place for it) 184a587d139SMark *d_mat = NULL; 185a587d139SMark } 186a587d139SMark PetscFunctionReturn(0); 187a587d139SMark } 188*076ba34aSJunchao Zhang 189*076ba34aSJunchao Zhang /* Generate the transpose on device and cache it internally */ 190*076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix **csrmatT) 191152b3e56SJunchao Zhang { 192152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 193152b3e56SJunchao Zhang 194152b3e56SJunchao Zhang PetscFunctionBegin; 195*076ba34aSJunchao Zhang if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 196*076ba34aSJunchao Zhang if (!aijkok->csrmatT.nnz() || !aijkok->transpose_updated) { /* Generate At for the first time OR just update its values */ 197*076ba34aSJunchao Zhang /* FIXME: KK does not separate symbolic/numeric transpose. We could have a permutation array to help value-only update */ 198*076ba34aSJunchao Zhang CHKERRCXX(aijkok->a_dual.sync_device()); 199*076ba34aSJunchao Zhang CHKERRCXX(aijkok->csrmatT = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 200*076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatT)); 20186a27549SJunchao Zhang aijkok->transpose_updated = PETSC_TRUE; 202*076ba34aSJunchao Zhang } 203*076ba34aSJunchao Zhang *csrmatT = &aijkok->csrmatT; 204152b3e56SJunchao Zhang PetscFunctionReturn(0); 205152b3e56SJunchao Zhang } 206152b3e56SJunchao Zhang 207*076ba34aSJunchao Zhang /* Generate the Hermitian on device and cache it internally */ 208*076ba34aSJunchao Zhang static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix **csrmatH) 209152b3e56SJunchao Zhang { 210152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 211152b3e56SJunchao Zhang 212152b3e56SJunchao Zhang PetscFunctionBegin; 213*076ba34aSJunchao Zhang if (!aijkok) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr"); 214*076ba34aSJunchao Zhang if (!aijkok->csrmatH.nnz() || !aijkok->hermitian_updated) { /* Generate Ah for the first time OR just update its values */ 215*076ba34aSJunchao Zhang CHKERRCXX(aijkok->a_dual.sync_device()); 216*076ba34aSJunchao Zhang CHKERRCXX(aijkok->csrmatH = KokkosKernels::Impl::transpose_matrix(aijkok->csrmat)); 217*076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(aijkok->csrmatH)); 218*076ba34aSJunchao Zhang #if defined(PETSC_USE_COMPLEX) 219*076ba34aSJunchao Zhang const auto& a = aijkok->csrmatH.values; 220*076ba34aSJunchao Zhang Kokkos::parallel_for(a.extent(0),KOKKOS_LAMBDA(MatRowMapType i) {a(i) = PetscConj(a(i));}); 221*076ba34aSJunchao Zhang #endif 22286a27549SJunchao Zhang aijkok->hermitian_updated = PETSC_TRUE; 223*076ba34aSJunchao Zhang } 224*076ba34aSJunchao Zhang *csrmatH = &aijkok->csrmatH; 225152b3e56SJunchao Zhang PetscFunctionReturn(0); 226152b3e56SJunchao Zhang } 227a587d139SMark 2288c3ff71bSJunchao Zhang /* y = A x */ 2298c3ff71bSJunchao Zhang static PetscErrorCode MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2308c3ff71bSJunchao Zhang { 2318c3ff71bSJunchao Zhang PetscErrorCode ierr; 2328c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 233152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 234152b3e56SJunchao Zhang PetscScalarKokkosView yv; 2358c3ff71bSJunchao Zhang 2368c3ff71bSJunchao Zhang PetscFunctionBegin; 2378c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 238152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 239152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 2408c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 241152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,0.0/*beta*/,yv); /* y = alpha A x + beta y */ 242152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 243152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 244bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 245*076ba34aSJunchao Zhang /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */ 246152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 2478c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2488c3ff71bSJunchao Zhang } 2498c3ff71bSJunchao Zhang 2508c3ff71bSJunchao Zhang /* y = A^T x */ 2518c3ff71bSJunchao Zhang static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2528c3ff71bSJunchao Zhang { 2538c3ff71bSJunchao Zhang PetscErrorCode ierr; 2548c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 255152b3e56SJunchao Zhang const char *mode; 256152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 257152b3e56SJunchao Zhang PetscScalarKokkosView yv; 258*076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 2598c3ff71bSJunchao Zhang 2608c3ff71bSJunchao Zhang PetscFunctionBegin; 2618c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 262152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 263152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 264152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 265*076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 266152b3e56SJunchao Zhang mode = "N"; 267152b3e56SJunchao Zhang } else { 268*076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 269*076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 270152b3e56SJunchao Zhang mode = "T"; 271152b3e56SJunchao Zhang } 272*076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^T x + beta y */ 273152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 274152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 275bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 276*076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 2778c3ff71bSJunchao Zhang PetscFunctionReturn(0); 2788c3ff71bSJunchao Zhang } 2798c3ff71bSJunchao Zhang 2808c3ff71bSJunchao Zhang /* y = A^H x */ 2818c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy) 2828c3ff71bSJunchao Zhang { 2838c3ff71bSJunchao Zhang PetscErrorCode ierr; 2848c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 285152b3e56SJunchao Zhang const char *mode; 286152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv; 287152b3e56SJunchao Zhang PetscScalarKokkosView yv; 288*076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 2898c3ff71bSJunchao Zhang 2908c3ff71bSJunchao Zhang PetscFunctionBegin; 2918c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 292152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 293152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 294152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 295*076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 296152b3e56SJunchao Zhang mode = "N"; 297152b3e56SJunchao Zhang } else { 298*076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 299*076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 300152b3e56SJunchao Zhang mode = "C"; 301152b3e56SJunchao Zhang } 302*076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,0.0/*beta*/,yv); /* y = alpha A^H x + beta y */ 303152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 304152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 305bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 306*076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 3078c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3088c3ff71bSJunchao Zhang } 3098c3ff71bSJunchao Zhang 3108c3ff71bSJunchao Zhang /* z = A x + y */ 3118c3ff71bSJunchao Zhang static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy, Vec zz) 3128c3ff71bSJunchao Zhang { 3138c3ff71bSJunchao Zhang PetscErrorCode ierr; 3148c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 315152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 316152b3e56SJunchao Zhang PetscScalarKokkosView zv; 3178c3ff71bSJunchao Zhang 3188c3ff71bSJunchao Zhang PetscFunctionBegin; 3198c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 320152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 321152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 322152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 3238c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 3248c3ff71bSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 325152b3e56SJunchao Zhang KokkosSparse::spmv("N",1.0/*alpha*/,aijkok->csrmat,xv,1.0/*beta*/,zv); /* z = alpha A x + beta z */ 326152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 327152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 328152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 329bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 330152b3e56SJunchao Zhang ierr = PetscLogGpuFlops(2.0*aijkok->csrmat.nnz());CHKERRQ(ierr); 3318c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3328c3ff71bSJunchao Zhang } 3338c3ff71bSJunchao Zhang 3348c3ff71bSJunchao Zhang /* z = A^T x + y */ 3358c3ff71bSJunchao Zhang static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3368c3ff71bSJunchao Zhang { 3378c3ff71bSJunchao Zhang PetscErrorCode ierr; 3388c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 339152b3e56SJunchao Zhang const char *mode; 340152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 341152b3e56SJunchao Zhang PetscScalarKokkosView zv; 342*076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3438c3ff71bSJunchao Zhang 3448c3ff71bSJunchao Zhang PetscFunctionBegin; 3458c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 346152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 347152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 348152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 3498c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 350152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 351*076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmat);CHKERRQ(ierr); 352152b3e56SJunchao Zhang mode = "N"; 353152b3e56SJunchao Zhang } else { 354*076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 355*076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 356152b3e56SJunchao Zhang mode = "T"; 357152b3e56SJunchao Zhang } 358*076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^T x + beta z */ 359152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 360152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 361152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 362bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 363*076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 3648c3ff71bSJunchao Zhang PetscFunctionReturn(0); 3658c3ff71bSJunchao Zhang } 3668c3ff71bSJunchao Zhang 3678c3ff71bSJunchao Zhang /* z = A^H x + y */ 3688c3ff71bSJunchao Zhang static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz) 3698c3ff71bSJunchao Zhang { 3708c3ff71bSJunchao Zhang PetscErrorCode ierr; 3718c3ff71bSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 372152b3e56SJunchao Zhang const char *mode; 373152b3e56SJunchao Zhang ConstPetscScalarKokkosView xv,yv; 374152b3e56SJunchao Zhang PetscScalarKokkosView zv; 375*076ba34aSJunchao Zhang KokkosCsrMatrix *csrmat; 3768c3ff71bSJunchao Zhang 3778c3ff71bSJunchao Zhang PetscFunctionBegin; 3788c3ff71bSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 379152b3e56SJunchao Zhang ierr = VecGetKokkosView(xx,&xv);CHKERRQ(ierr); 380152b3e56SJunchao Zhang ierr = VecGetKokkosView(yy,&yv);CHKERRQ(ierr); 381152b3e56SJunchao Zhang ierr = VecGetKokkosView(zz,&zv);CHKERRQ(ierr); 3828c3ff71bSJunchao Zhang if (zz != yy) Kokkos::deep_copy(zv,yv); 383152b3e56SJunchao Zhang if (A->form_explicit_transpose) { 384*076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateHermitian_Private(A,&csrmat);CHKERRQ(ierr); 385152b3e56SJunchao Zhang mode = "N"; 386152b3e56SJunchao Zhang } else { 387*076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 388*076ba34aSJunchao Zhang csrmat = &aijkok->csrmat; 389152b3e56SJunchao Zhang mode = "C"; 390152b3e56SJunchao Zhang } 391*076ba34aSJunchao Zhang KokkosSparse::spmv(mode,1.0/*alpha*/,*csrmat,xv,1.0/*beta*/,zv); /* z = alpha A^H x + beta z */ 392152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(xx,&xv);CHKERRQ(ierr); 393152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(yy,&yv);CHKERRQ(ierr); 394152b3e56SJunchao Zhang ierr = VecRestoreKokkosView(zz,&zv);CHKERRQ(ierr); 395bb2d6e60SJunchao Zhang ierr = WaitForKokkos();CHKERRQ(ierr); 396*076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(2.0*csrmat->nnz());CHKERRQ(ierr); 397152b3e56SJunchao Zhang PetscFunctionReturn(0); 398152b3e56SJunchao Zhang } 399152b3e56SJunchao Zhang 400152b3e56SJunchao Zhang PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg) 401152b3e56SJunchao Zhang { 402152b3e56SJunchao Zhang PetscErrorCode ierr; 403152b3e56SJunchao Zhang Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 404152b3e56SJunchao Zhang 405152b3e56SJunchao Zhang PetscFunctionBegin; 406152b3e56SJunchao Zhang switch (op) { 407152b3e56SJunchao Zhang case MAT_FORM_EXPLICIT_TRANSPOSE: 408152b3e56SJunchao Zhang /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */ 409152b3e56SJunchao Zhang if (A->form_explicit_transpose && !flg && aijkok) {ierr = aijkok->DestroyMatTranspose();CHKERRQ(ierr);} 410152b3e56SJunchao Zhang A->form_explicit_transpose = flg; 411152b3e56SJunchao Zhang break; 412152b3e56SJunchao Zhang default: 413152b3e56SJunchao Zhang ierr = MatSetOption_SeqAIJ(A,op,flg);CHKERRQ(ierr); 414152b3e56SJunchao Zhang break; 415152b3e56SJunchao Zhang } 4168c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4178c3ff71bSJunchao Zhang } 4188c3ff71bSJunchao Zhang 419*076ba34aSJunchao Zhang /* Depending on reuse, either build a new mat, or use the existing mat */ 4203d0639e7SStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat* newmat) 4218c3ff71bSJunchao Zhang { 4228c3ff71bSJunchao Zhang PetscErrorCode ierr; 423*076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 4248c3ff71bSJunchao Zhang 4258c3ff71bSJunchao Zhang PetscFunctionBegin; 426a3f881fbSStefano Zampini ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 427*076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */ 428*076ba34aSJunchao Zhang ierr = MatDuplicate(A,MAT_COPY_VALUES,newmat);CHKERRQ(ierr); /* the returned newmat is a SeqAIJKokkos */ 4298c3ff71bSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */ 430*076ba34aSJunchao Zhang ierr = MatCopy(A,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr); /* newmat is already a SeqAIJKokkos */ 431*076ba34aSJunchao Zhang } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */ 432*076ba34aSJunchao Zhang if (A != *newmat) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_PLIB,"A != *newmat with MAT_INPLACE_MATRIX"); 433*076ba34aSJunchao Zhang ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr); 434*076ba34aSJunchao Zhang ierr = PetscStrallocpy(VECKOKKOS,&A->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 435*076ba34aSJunchao Zhang ierr = PetscObjectChangeTypeName((PetscObject)A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 436*076ba34aSJunchao Zhang ierr = MatSetOps_SeqAIJKokkos(A);CHKERRQ(ierr); 437*076ba34aSJunchao Zhang aseq = static_cast<Mat_SeqAIJ*>(A->data); 438*076ba34aSJunchao Zhang if (A->assembled) { /* Copy i, j to device for an assembled matrix if not yet */ 439*076ba34aSJunchao Zhang if (A->spptr) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_PLIB,"Expect NULL (Mat_SeqAIJKokkos*)A->spptr"); 440*076ba34aSJunchao Zhang A->spptr = new Mat_SeqAIJKokkos(A->rmap->n,A->cmap->n,aseq->nz,aseq->i,aseq->j,aseq->a,A->nonzerostate,PETSC_FALSE); 4418c3ff71bSJunchao Zhang } 442*076ba34aSJunchao Zhang } 4438c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4448c3ff71bSJunchao Zhang } 4458c3ff71bSJunchao Zhang 446*076ba34aSJunchao Zhang /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or 447*076ba34aSJunchao Zhang an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix. 448*076ba34aSJunchao Zhang */ 449*076ba34aSJunchao Zhang static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat *B) 4508c3ff71bSJunchao Zhang { 4518c3ff71bSJunchao Zhang PetscErrorCode ierr; 452*076ba34aSJunchao Zhang Mat_SeqAIJ *bseq; 453*076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr),*bkok; 454*076ba34aSJunchao Zhang Mat mat; 4558c3ff71bSJunchao Zhang 4568c3ff71bSJunchao Zhang PetscFunctionBegin; 457*076ba34aSJunchao Zhang /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */ 458*076ba34aSJunchao Zhang ierr = MatDuplicate_SeqAIJ(A,MAT_DO_NOT_COPY_VALUES,B);CHKERRQ(ierr); 459*076ba34aSJunchao Zhang mat = *B; 460*076ba34aSJunchao Zhang if (A->assembled) { 461*076ba34aSJunchao Zhang bseq = static_cast<Mat_SeqAIJ*>(mat->data); 462*076ba34aSJunchao Zhang bkok = new Mat_SeqAIJKokkos(mat->rmap->n,mat->cmap->n,bseq->nz,bseq->i,bseq->j,bseq->a,mat->nonzerostate,PETSC_FALSE); 463*076ba34aSJunchao Zhang bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */ 464*076ba34aSJunchao Zhang /* Now copy values to B if needed */ 465*076ba34aSJunchao Zhang if (dupOption == MAT_COPY_VALUES) { 466*076ba34aSJunchao Zhang if (akok->a_dual.need_sync_device()) { 467*076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_host(),akok->a_dual.view_host()); 468*076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 469*076ba34aSJunchao Zhang } else { /* If device has the latest data, we only copy data on device */ 470*076ba34aSJunchao Zhang Kokkos::deep_copy(bkok->a_dual.view_device(),akok->a_dual.view_device()); 471*076ba34aSJunchao Zhang bkok->a_dual.modify_device(); 472*076ba34aSJunchao Zhang } 473*076ba34aSJunchao Zhang } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */ 474*076ba34aSJunchao Zhang /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */ 475*076ba34aSJunchao Zhang bkok->a_dual.modify_host(); 476*076ba34aSJunchao Zhang } 477*076ba34aSJunchao Zhang mat->spptr = bkok; 478*076ba34aSJunchao Zhang } 479*076ba34aSJunchao Zhang 480*076ba34aSJunchao Zhang ierr = PetscFree(mat->defaultvectype);CHKERRQ(ierr); 481*076ba34aSJunchao Zhang ierr = PetscStrallocpy(VECKOKKOS,&mat->defaultvectype);CHKERRQ(ierr); /* Allocate and copy the string */ 482*076ba34aSJunchao Zhang ierr = PetscObjectChangeTypeName((PetscObject)mat,MATSEQAIJKOKKOS);CHKERRQ(ierr); 483*076ba34aSJunchao Zhang ierr = MatSetOps_SeqAIJKokkos(mat);CHKERRQ(ierr); 4848c3ff71bSJunchao Zhang PetscFunctionReturn(0); 4858c3ff71bSJunchao Zhang } 4868c3ff71bSJunchao Zhang 4878c3ff71bSJunchao Zhang static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A) 4888c3ff71bSJunchao Zhang { 4898c3ff71bSJunchao Zhang PetscErrorCode ierr; 49086a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 4918c3ff71bSJunchao Zhang 4928c3ff71bSJunchao Zhang PetscFunctionBegin; 49386a27549SJunchao Zhang if (A->factortype == MAT_FACTOR_NONE) { 49486a27549SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 4958f7e8f9dSMark Adams if (aijkok) { 4968f7e8f9dSMark Adams if (aijkok->device_mat_d.data()) { 497a587d139SMark delete aijkok->colmap_d; 498a587d139SMark delete aijkok->i_uncompressed_d; 499a587d139SMark } 5008f7e8f9dSMark Adams if (aijkok->diag_d) delete aijkok->diag_d; 5018f7e8f9dSMark Adams } 5028c3ff71bSJunchao Zhang delete aijkok; 50386a27549SJunchao Zhang } else { 50486a27549SJunchao Zhang delete static_cast<Mat_SeqAIJKokkosTriFactors*>(A->spptr); 50586a27549SJunchao Zhang } 506152b3e56SJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)A,"MatFactorGetSolverType_C",NULL);CHKERRQ(ierr); 507152b3e56SJunchao Zhang A->spptr = NULL; 5088c3ff71bSJunchao Zhang ierr = MatDestroy_SeqAIJ(A);CHKERRQ(ierr); 5098c3ff71bSJunchao Zhang PetscFunctionReturn(0); 5108c3ff71bSJunchao Zhang } 5118c3ff71bSJunchao Zhang 51286a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A) 51386a27549SJunchao Zhang { 51486a27549SJunchao Zhang PetscErrorCode ierr; 51586a27549SJunchao Zhang 51686a27549SJunchao Zhang PetscFunctionBegin; 51786a27549SJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 51886a27549SJunchao Zhang ierr = MatCreate_SeqAIJ(A);CHKERRQ(ierr); 51986a27549SJunchao Zhang ierr = MatConvert_SeqAIJ_SeqAIJKokkos(A,MATSEQAIJKOKKOS,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 52086a27549SJunchao Zhang PetscFunctionReturn(0); 52186a27549SJunchao Zhang } 52286a27549SJunchao Zhang 523*076ba34aSJunchao Zhang /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */ 524*076ba34aSJunchao Zhang PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat* C) 525a3f881fbSStefano Zampini { 526*076ba34aSJunchao Zhang PetscErrorCode ierr; 527*076ba34aSJunchao Zhang Mat_SeqAIJ *a,*b; 528*076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok,*bkok,*ckok; 529*076ba34aSJunchao Zhang MatScalarKokkosView aa,ba,ca; 530*076ba34aSJunchao Zhang MatRowMapKokkosView ai,bi,ci; 531*076ba34aSJunchao Zhang MatColIdxKokkosView aj,bj,cj; 532*076ba34aSJunchao Zhang PetscInt m,n,nnz,aN; 533a3f881fbSStefano Zampini 534a3f881fbSStefano Zampini PetscFunctionBegin; 535*076ba34aSJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 536*076ba34aSJunchao Zhang PetscValidHeaderSpecific(B,MAT_CLASSID,2); 537*076ba34aSJunchao Zhang PetscValidPointer(C,4); 538*076ba34aSJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 539*076ba34aSJunchao Zhang PetscCheckTypeName(B,MATSEQAIJKOKKOS); 540*076ba34aSJunchao Zhang if (A->rmap->n != B->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Invalid number or rows %D != %D",A->rmap->n,B->rmap->n); 541*076ba34aSJunchao Zhang if (reuse == MAT_INPLACE_MATRIX) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MAT_INPLACE_MATRIX not supported"); 542*076ba34aSJunchao Zhang 543*076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 544*076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 545*076ba34aSJunchao Zhang a = static_cast<Mat_SeqAIJ*>(A->data); 546*076ba34aSJunchao Zhang b = static_cast<Mat_SeqAIJ*>(B->data); 547*076ba34aSJunchao Zhang akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 548*076ba34aSJunchao Zhang bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 549*076ba34aSJunchao Zhang aa = akok->a_dual.view_device(); 550*076ba34aSJunchao Zhang ai = akok->i_dual.view_device(); 551*076ba34aSJunchao Zhang ba = bkok->a_dual.view_device(); 552*076ba34aSJunchao Zhang bi = bkok->i_dual.view_device(); 553*076ba34aSJunchao Zhang m = A->rmap->n; /* M, N and nnz of C */ 554*076ba34aSJunchao Zhang n = A->cmap->n + B->cmap->n; 555*076ba34aSJunchao Zhang nnz = a->nz + b->nz; 556*076ba34aSJunchao Zhang aN = A->cmap->n; /* N of A */ 557*076ba34aSJunchao Zhang if (reuse == MAT_INITIAL_MATRIX) { 558*076ba34aSJunchao Zhang aj = akok->j_dual.view_device(); 559*076ba34aSJunchao Zhang bj = bkok->j_dual.view_device(); 560*076ba34aSJunchao Zhang auto ca_dual = MatScalarKokkosDualView("a",aa.extent(0)+ba.extent(0)); 561*076ba34aSJunchao Zhang auto ci_dual = MatRowMapKokkosDualView("i",ai.extent(0)); 562*076ba34aSJunchao Zhang auto cj_dual = MatColIdxKokkosDualView("j",aj.extent(0)+bj.extent(0)); 563*076ba34aSJunchao Zhang ca = ca_dual.view_device(); 564*076ba34aSJunchao Zhang ci = ci_dual.view_device(); 565*076ba34aSJunchao Zhang cj = cj_dual.view_device(); 566*076ba34aSJunchao Zhang 567*076ba34aSJunchao Zhang /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */ 568*076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 569*076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 570*076ba34aSJunchao Zhang PetscInt coffset = ai(i) + bi(i), alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 571*076ba34aSJunchao Zhang 572*076ba34aSJunchao Zhang Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */ 573*076ba34aSJunchao Zhang ci(i) = coffset; 574*076ba34aSJunchao Zhang if (i == m-1) ci(m) = ai(m) + bi(m); 575*076ba34aSJunchao Zhang }); 576*076ba34aSJunchao Zhang 577*076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 578*076ba34aSJunchao Zhang if (k < alen) { 579*076ba34aSJunchao Zhang ca(coffset+k) = aa(ai(i)+k); 580*076ba34aSJunchao Zhang cj(coffset+k) = aj(ai(i)+k); 581*076ba34aSJunchao Zhang } else { 582*076ba34aSJunchao Zhang ca(coffset+k) = ba(bi(i)+k-alen); 583*076ba34aSJunchao Zhang cj(coffset+k) = bj(bi(i)+k-alen) + aN; /* Entries in B get new column indices in C */ 584*076ba34aSJunchao Zhang } 585*076ba34aSJunchao Zhang }); 586*076ba34aSJunchao Zhang }); 587*076ba34aSJunchao Zhang ca_dual.modify_device(); 588*076ba34aSJunchao Zhang ci_dual.modify_device(); 589*076ba34aSJunchao Zhang cj_dual.modify_device(); 590*076ba34aSJunchao Zhang CHKERRCXX(ckok = new Mat_SeqAIJKokkos(m,n,nnz,ci_dual,cj_dual,ca_dual)); 591*076ba34aSJunchao Zhang ierr = MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF,ckok,C);CHKERRQ(ierr); 592*076ba34aSJunchao Zhang } else if (reuse == MAT_REUSE_MATRIX) { 593*076ba34aSJunchao Zhang PetscValidHeaderSpecific(*C,MAT_CLASSID,4); 594*076ba34aSJunchao Zhang PetscCheckTypeName(*C,MATSEQAIJKOKKOS); 595*076ba34aSJunchao Zhang ckok = static_cast<Mat_SeqAIJKokkos*>((*C)->spptr); 596*076ba34aSJunchao Zhang ca = ckok->a_dual.view_device(); 597*076ba34aSJunchao Zhang ci = ckok->i_dual.view_device(); 598*076ba34aSJunchao Zhang 599*076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamPolicy<>(m, Kokkos::AUTO()),KOKKOS_LAMBDA(const KokkosTeamMemberType& t) { 600*076ba34aSJunchao Zhang PetscInt i = t.league_rank(); /* row i */ 601*076ba34aSJunchao Zhang PetscInt alen = ai(i+1)-ai(i), blen = bi(i+1)-bi(i); 602*076ba34aSJunchao Zhang Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen+blen), [&](PetscInt k) { 603*076ba34aSJunchao Zhang if (k < alen) ca(ci(i)+k) = aa(ai(i)+k); 604*076ba34aSJunchao Zhang else ca(ci(i)+k) = ba(bi(i)+k-alen); 605*076ba34aSJunchao Zhang }); 606*076ba34aSJunchao Zhang }); 607*076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(*C);CHKERRQ(ierr); 608*076ba34aSJunchao Zhang } 609*076ba34aSJunchao Zhang PetscFunctionReturn(0); 610*076ba34aSJunchao Zhang } 611*076ba34aSJunchao Zhang 612*076ba34aSJunchao Zhang static PetscErrorCode MatProductDataDestroy_SeqAIJKokkos(void* pdata) 613*076ba34aSJunchao Zhang { 614*076ba34aSJunchao Zhang PetscFunctionBegin; 615*076ba34aSJunchao Zhang delete static_cast<MatProductData_SeqAIJKokkos*>(pdata); 616a3f881fbSStefano Zampini PetscFunctionReturn(0); 617a3f881fbSStefano Zampini } 618a3f881fbSStefano Zampini 619a3f881fbSStefano Zampini static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C) 620a3f881fbSStefano Zampini { 621*076ba34aSJunchao Zhang PetscErrorCode ierr; 622a3f881fbSStefano Zampini Mat_Product *product = C->product; 623a3f881fbSStefano Zampini Mat A,B; 624*076ba34aSJunchao Zhang bool transA,transB; /* use bool, since KK needs this type */ 625a3f881fbSStefano Zampini Mat_SeqAIJKokkos *akok,*bkok,*ckok; 626a3f881fbSStefano Zampini Mat_SeqAIJ *c; 627*076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 628*076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA,*csrmatB; 629a3f881fbSStefano Zampini 630a3f881fbSStefano Zampini PetscFunctionBegin; 631a3f881fbSStefano Zampini MatCheckProduct(C,1); 632a3f881fbSStefano Zampini if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty"); 633*076ba34aSJunchao Zhang pdata = static_cast<MatProductData_SeqAIJKokkos*>(C->product->data); 634*076ba34aSJunchao Zhang 635*076ba34aSJunchao Zhang if (pdata->reusesym) { /* We reached here through e.g., MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C), where symbolic/numeric are combined */ 636*076ba34aSJunchao Zhang pdata->reusesym = PETSC_FALSE; /* So that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C), we still do numeric */ 637*076ba34aSJunchao Zhang PetscFunctionReturn(0); 638*076ba34aSJunchao Zhang } 639*076ba34aSJunchao Zhang 640*076ba34aSJunchao Zhang switch (product->type) { 641*076ba34aSJunchao Zhang case MATPRODUCT_AB: transA = false; transB = false; break; 642*076ba34aSJunchao Zhang case MATPRODUCT_AtB: transA = true; transB = false; break; 643*076ba34aSJunchao Zhang case MATPRODUCT_ABt: transA = false; transB = true; break; 644*076ba34aSJunchao Zhang default: 645*076ba34aSJunchao Zhang SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 646*076ba34aSJunchao Zhang } 647*076ba34aSJunchao Zhang 648a3f881fbSStefano Zampini A = product->A; 649a3f881fbSStefano Zampini B = product->B; 650a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 651a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 652a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 653a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 654a3f881fbSStefano Zampini ckok = static_cast<Mat_SeqAIJKokkos*>(C->spptr); 655*076ba34aSJunchao Zhang 656*076ba34aSJunchao Zhang if (!ckok) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Device data structure spptr is empty"); 657*076ba34aSJunchao Zhang 658*076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 659*076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 660*076ba34aSJunchao Zhang 661*076ba34aSJunchao Zhang /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */ 662*076ba34aSJunchao Zhang if (transA) { 663*076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 664*076ba34aSJunchao Zhang transA = false; 665a3f881fbSStefano Zampini } 666a3f881fbSStefano Zampini 667*076ba34aSJunchao Zhang if (transB) { 668*076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 669*076ba34aSJunchao Zhang transB = false; 670*076ba34aSJunchao Zhang } 671*076ba34aSJunchao Zhang 672*076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,ckok->csrmat)); 673*076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(ckok->csrmat)); /* without the sort, mat_tests-ex62_14_seqaijkokkos failed */ 674*076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(C);CHKERRQ(ierr); 675a3f881fbSStefano Zampini /* shorter version of MatAssemblyEnd_SeqAIJ */ 676a3f881fbSStefano Zampini c = (Mat_SeqAIJ*)C->data; 677a3f881fbSStefano Zampini ierr = PetscInfo3(C,"Matrix size: %D X %D; storage space: 0 unneeded,%D used\n",C->rmap->n,C->cmap->n,c->nz);CHKERRQ(ierr); 678a3f881fbSStefano Zampini ierr = PetscInfo(C,"Number of mallocs during MatSetValues() is 0\n");CHKERRQ(ierr); 679a3f881fbSStefano Zampini ierr = PetscInfo1(C,"Maximum nonzeros in any row is %D\n",c->rmax);CHKERRQ(ierr); 680a3f881fbSStefano Zampini c->reallocs = 0; 681*076ba34aSJunchao Zhang C->info.mallocs = 0; 682a3f881fbSStefano Zampini C->info.nz_unneeded = 0; 683a3f881fbSStefano Zampini C->assembled = C->was_assembled = PETSC_TRUE; 684a3f881fbSStefano Zampini C->num_ass++; 685a3f881fbSStefano Zampini PetscFunctionReturn(0); 686a3f881fbSStefano Zampini } 687a3f881fbSStefano Zampini 688a3f881fbSStefano Zampini static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C) 689a3f881fbSStefano Zampini { 690a3f881fbSStefano Zampini PetscErrorCode ierr; 691*076ba34aSJunchao Zhang Mat_Product *product = C->product; 692*076ba34aSJunchao Zhang MatProductType ptype; 693*076ba34aSJunchao Zhang Mat A,B; 694*076ba34aSJunchao Zhang bool transA,transB; 695*076ba34aSJunchao Zhang Mat_SeqAIJKokkos *akok,*bkok,*ckok; 696*076ba34aSJunchao Zhang MatProductData_SeqAIJKokkos *pdata; 697*076ba34aSJunchao Zhang MPI_Comm comm; 698*076ba34aSJunchao Zhang KokkosCsrMatrix *csrmatA,*csrmatB,csrmatC; 699a3f881fbSStefano Zampini 700a3f881fbSStefano Zampini PetscFunctionBegin; 701a3f881fbSStefano Zampini MatCheckProduct(C,1); 702*076ba34aSJunchao Zhang ierr = PetscObjectGetComm((PetscObject)C,&comm); 703*076ba34aSJunchao Zhang if (product->data) SETERRQ(comm,PETSC_ERR_PLIB,"Product data not empty"); 704a3f881fbSStefano Zampini A = product->A; 705a3f881fbSStefano Zampini B = product->B; 706a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 707a3f881fbSStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); 708a3f881fbSStefano Zampini akok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 709a3f881fbSStefano Zampini bkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 710*076ba34aSJunchao Zhang csrmatA = &akok->csrmat; 711*076ba34aSJunchao Zhang csrmatB = &bkok->csrmat; 712*076ba34aSJunchao Zhang 713a3f881fbSStefano Zampini ptype = product->type; 714a3f881fbSStefano Zampini switch (ptype) { 715*076ba34aSJunchao Zhang case MATPRODUCT_AB: transA = false; transB = false; break; 716*076ba34aSJunchao Zhang case MATPRODUCT_AtB: transA = true; transB = false; break; 717*076ba34aSJunchao Zhang case MATPRODUCT_ABt: transA = false; transB = true; break; 718a3f881fbSStefano Zampini default: 719*076ba34aSJunchao Zhang SETERRQ1(comm,PETSC_ERR_PLIB,"Unsupported product type %s",MatProductTypes[product->type]); 720a3f881fbSStefano Zampini } 721a3f881fbSStefano Zampini 722*076ba34aSJunchao Zhang product->data = pdata = new MatProductData_SeqAIJKokkos(); 723*076ba34aSJunchao Zhang pdata->kh.set_team_work_size(16); 724*076ba34aSJunchao Zhang pdata->kh.set_dynamic_scheduling(true); 725*076ba34aSJunchao Zhang pdata->reusesym = product->api_user; 726a3f881fbSStefano Zampini 727*076ba34aSJunchao Zhang /* TODO: add command line options to select spgemm algorithms */ 728*076ba34aSJunchao Zhang auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK; 729*076ba34aSJunchao Zhang #if defined(PETSC_HAVE_CUDA) 730*076ba34aSJunchao Zhang #if PETSC_PKG_CUDA_VERSION_GE(11,0,0) 731*076ba34aSJunchao Zhang /* This algorithm + cuda-10.2 sometimes gave wrong results (invalid device pointers in csrmatC) and failed snes/tutorials/ex56.c */ 732*076ba34aSJunchao Zhang spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_CUSPARSE; 733*076ba34aSJunchao Zhang #endif 734*076ba34aSJunchao Zhang #endif 735*076ba34aSJunchao Zhang pdata->kh.create_spgemm_handle(spgemm_alg); 736*076ba34aSJunchao Zhang 737*076ba34aSJunchao Zhang /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */ 738*076ba34aSJunchao Zhang if (transA) { 739*076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(A,&csrmatA);CHKERRQ(ierr); 740*076ba34aSJunchao Zhang transA = false; 741*076ba34aSJunchao Zhang } 742*076ba34aSJunchao Zhang 743*076ba34aSJunchao Zhang if (transB) { 744*076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosGenerateTranspose_Private(B,&csrmatB);CHKERRQ(ierr); 745*076ba34aSJunchao Zhang transB = false; 746*076ba34aSJunchao Zhang } 747*076ba34aSJunchao Zhang 748*076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_symbolic(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 749*076ba34aSJunchao Zhang /* spgemm_symbolic() only populates C's rowmap, but not C's column indices. 750*076ba34aSJunchao Zhang So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before 751*076ba34aSJunchao Zhang calling new Mat_SeqAIJKokkos(). 752*076ba34aSJunchao Zhang TODO: Remove the fake spgemm_numeric() after KK fixed this problem. 753*076ba34aSJunchao Zhang */ 754*076ba34aSJunchao Zhang CHKERRCXX(KokkosSparse::spgemm_numeric(pdata->kh,*csrmatA,transA,*csrmatB,transB,csrmatC)); 755*076ba34aSJunchao Zhang CHKERRCXX(KokkosKernels::sort_crs_matrix(csrmatC)); 756*076ba34aSJunchao Zhang 757*076ba34aSJunchao Zhang CHKERRCXX(ckok = new Mat_SeqAIJKokkos(csrmatC)); 758*076ba34aSJunchao Zhang ierr = MatSetSeqAIJKokkosWithCSRMatrix(C,ckok);CHKERRQ(ierr); 759*076ba34aSJunchao Zhang C->product->destroy = MatProductDataDestroy_SeqAIJKokkos; 760a3f881fbSStefano Zampini PetscFunctionReturn(0); 761a3f881fbSStefano Zampini } 762a3f881fbSStefano Zampini 763a3f881fbSStefano Zampini /* handles sparse matrix matrix ops */ 764*076ba34aSJunchao Zhang static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat) 765a3f881fbSStefano Zampini { 766a3f881fbSStefano Zampini PetscErrorCode ierr; 767*076ba34aSJunchao Zhang Mat_Product *product = mat->product; 768a3f881fbSStefano Zampini PetscBool Biskok = PETSC_FALSE,Ciskok = PETSC_TRUE; 769a3f881fbSStefano Zampini 770a3f881fbSStefano Zampini PetscFunctionBegin; 771a3f881fbSStefano Zampini MatCheckProduct(mat,1); 772a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->B,MATSEQAIJKOKKOS,&Biskok);CHKERRQ(ierr); 773a3f881fbSStefano Zampini if (product->type == MATPRODUCT_ABC) { 774a3f881fbSStefano Zampini ierr = PetscObjectTypeCompare((PetscObject)product->C,MATSEQAIJKOKKOS,&Ciskok);CHKERRQ(ierr); 775a3f881fbSStefano Zampini } 776a3f881fbSStefano Zampini if (Biskok && Ciskok) { 777a3f881fbSStefano Zampini switch (product->type) { 778a3f881fbSStefano Zampini case MATPRODUCT_AB: 779a3f881fbSStefano Zampini case MATPRODUCT_AtB: 780a3f881fbSStefano Zampini case MATPRODUCT_ABt: 781a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos; 782a3f881fbSStefano Zampini break; 783a3f881fbSStefano Zampini case MATPRODUCT_PtAP: 784a3f881fbSStefano Zampini case MATPRODUCT_RARt: 785a3f881fbSStefano Zampini case MATPRODUCT_ABC: 786a3f881fbSStefano Zampini mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic; 787a3f881fbSStefano Zampini break; 788a3f881fbSStefano Zampini default: 789a3f881fbSStefano Zampini break; 790a3f881fbSStefano Zampini } 791a3f881fbSStefano Zampini } else { /* fallback for AIJ */ 792a3f881fbSStefano Zampini ierr = MatProductSetFromOptions_SeqAIJ(mat);CHKERRQ(ierr); 793a3f881fbSStefano Zampini } 794a3f881fbSStefano Zampini PetscFunctionReturn(0); 795a3f881fbSStefano Zampini } 796a587d139SMark 797f0cf5187SStefano Zampini static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a) 798f0cf5187SStefano Zampini { 799f0cf5187SStefano Zampini PetscErrorCode ierr; 800f0cf5187SStefano Zampini Mat_SeqAIJKokkos *aijkok; 801f0cf5187SStefano Zampini 802f0cf5187SStefano Zampini PetscFunctionBegin; 803f0cf5187SStefano Zampini ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 804f0cf5187SStefano Zampini aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 805*076ba34aSJunchao Zhang KokkosBlas::scal(aijkok->a_dual.view_device(),a,aijkok->a_dual.view_device()); 806*076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 807f0cf5187SStefano Zampini ierr = WaitForKokkos();CHKERRQ(ierr); 808*076ba34aSJunchao Zhang ierr = PetscLogGpuFlops(aijkok->a_dual.extent(0));CHKERRQ(ierr); 809f0cf5187SStefano Zampini PetscFunctionReturn(0); 810f0cf5187SStefano Zampini } 811f0cf5187SStefano Zampini 812a587d139SMark static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A) 813a587d139SMark { 814a587d139SMark PetscErrorCode ierr; 815*076ba34aSJunchao Zhang Mat_SeqAIJKokkos *aijkok; 816a587d139SMark 817a587d139SMark PetscFunctionBegin; 818*076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 819*076ba34aSJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 820*076ba34aSJunchao Zhang KokkosBlas::fill(aijkok->a_dual.view_device(),0.0); 821*076ba34aSJunchao Zhang ierr = MatSeqAIJInvalidateDiagonal(A);CHKERRQ(ierr); /* Cached diagonal values are invalided */ 822*076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 823a587d139SMark PetscFunctionReturn(0); 824a587d139SMark } 825a587d139SMark 826db78de30SJunchao Zhang /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */ 827db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 828db78de30SJunchao Zhang { 829db78de30SJunchao Zhang PetscErrorCode ierr; 830db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 831db78de30SJunchao Zhang 832db78de30SJunchao Zhang PetscFunctionBegin; 833db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 834db78de30SJunchao Zhang PetscValidPointer(kv,2); 835db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 836db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 837db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 838*076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 839db78de30SJunchao Zhang PetscFunctionReturn(0); 840db78de30SJunchao Zhang } 841db78de30SJunchao Zhang 842db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,ConstPetscScalarKokkosView* kv) 843db78de30SJunchao Zhang { 844db78de30SJunchao Zhang PetscFunctionBegin; 845db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 846db78de30SJunchao Zhang PetscValidPointer(kv,2); 847db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 848db78de30SJunchao Zhang PetscFunctionReturn(0); 849db78de30SJunchao Zhang } 850db78de30SJunchao Zhang 851db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosView(Mat A,PetscScalarKokkosView* kv) 852db78de30SJunchao Zhang { 853db78de30SJunchao Zhang PetscErrorCode ierr; 854db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 855db78de30SJunchao Zhang 856db78de30SJunchao Zhang PetscFunctionBegin; 857db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 858db78de30SJunchao Zhang PetscValidPointer(kv,2); 859db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 860db78de30SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 861db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 862*076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 863db78de30SJunchao Zhang PetscFunctionReturn(0); 864db78de30SJunchao Zhang } 865db78de30SJunchao Zhang 866db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A,PetscScalarKokkosView* kv) 867db78de30SJunchao Zhang { 868db78de30SJunchao Zhang PetscErrorCode ierr; 869db78de30SJunchao Zhang 870db78de30SJunchao Zhang PetscFunctionBegin; 871db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 872db78de30SJunchao Zhang PetscValidPointer(kv,2); 873db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 874*076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 875db78de30SJunchao Zhang PetscFunctionReturn(0); 876db78de30SJunchao Zhang } 877db78de30SJunchao Zhang 878db78de30SJunchao Zhang PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 879db78de30SJunchao Zhang { 880db78de30SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 881db78de30SJunchao Zhang 882db78de30SJunchao Zhang PetscFunctionBegin; 883db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 884db78de30SJunchao Zhang PetscValidPointer(kv,2); 885db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 886db78de30SJunchao Zhang aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr); 887*076ba34aSJunchao Zhang *kv = aijkok->a_dual.view_device(); 888db78de30SJunchao Zhang PetscFunctionReturn(0); 889db78de30SJunchao Zhang } 890db78de30SJunchao Zhang 891db78de30SJunchao Zhang PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A,PetscScalarKokkosView* kv) 892db78de30SJunchao Zhang { 893db78de30SJunchao Zhang PetscErrorCode ierr; 894db78de30SJunchao Zhang 895db78de30SJunchao Zhang PetscFunctionBegin; 896db78de30SJunchao Zhang PetscValidHeaderSpecific(A,MAT_CLASSID,1); 897db78de30SJunchao Zhang PetscValidPointer(kv,2); 898db78de30SJunchao Zhang PetscCheckTypeName(A,MATSEQAIJKOKKOS); 899*076ba34aSJunchao Zhang ierr = MatSeqAIJKokkosModifyDevice(A);CHKERRQ(ierr); 900db78de30SJunchao Zhang PetscFunctionReturn(0); 901db78de30SJunchao Zhang } 902db78de30SJunchao Zhang 903db78de30SJunchao Zhang /* Computes Y += a*X */ 904f0cf5187SStefano Zampini static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar a,Mat X,MatStructure str) 905a587d139SMark { 906a587d139SMark PetscErrorCode ierr; 907a587d139SMark Mat_SeqAIJ *x = (Mat_SeqAIJ*)X->data,*y = (Mat_SeqAIJ*)Y->data; 908a587d139SMark 909a587d139SMark PetscFunctionBegin; 910f0cf5187SStefano Zampini if (X->ops->axpy != Y->ops->axpy) { 911a587d139SMark ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 912a587d139SMark PetscFunctionReturn(0); 913a587d139SMark } 914db78de30SJunchao Zhang 915f0cf5187SStefano Zampini if (str != SAME_NONZERO_PATTERN && x->nz == y->nz) { 916a587d139SMark PetscBool e; 917a587d139SMark ierr = PetscArraycmp(x->i,y->i,Y->rmap->n+1,&e);CHKERRQ(ierr); 918a587d139SMark if (e) { 919a587d139SMark ierr = PetscArraycmp(x->j,y->j,y->nz,&e);CHKERRQ(ierr); 920f0cf5187SStefano Zampini if (e) str = SAME_NONZERO_PATTERN; 921a587d139SMark } 922a587d139SMark } 923db78de30SJunchao Zhang 924a587d139SMark if (str != SAME_NONZERO_PATTERN) { 925db78de30SJunchao Zhang /* TODO: do MatAXPY on device */ 926a587d139SMark ierr = MatAXPY_SeqAIJ(Y,a,X,str);CHKERRQ(ierr); 927a587d139SMark PetscFunctionReturn(0); 928a587d139SMark } else { 929db78de30SJunchao Zhang ConstPetscScalarKokkosView xv; 930db78de30SJunchao Zhang PetscScalarKokkosView yv; 931db78de30SJunchao Zhang 932db78de30SJunchao Zhang ierr = MatSeqAIJGetKokkosView(X,&xv);CHKERRQ(ierr); 933db78de30SJunchao Zhang ierr = MatSeqAIJGetKokkosView(Y,&yv);CHKERRQ(ierr); 934db78de30SJunchao Zhang KokkosBlas::axpy(a,xv,yv); 935db78de30SJunchao Zhang ierr = MatSeqAIJRestoreKokkosView(X,&xv);CHKERRQ(ierr); 936db78de30SJunchao Zhang ierr = MatSeqAIJRestoreKokkosView(Y,&yv);CHKERRQ(ierr); 937a587d139SMark } 938a587d139SMark PetscFunctionReturn(0); 939a587d139SMark } 940a587d139SMark 94186a27549SJunchao Zhang static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 9428f7e8f9dSMark Adams { 9438f7e8f9dSMark Adams PetscErrorCode ierr; 9448f7e8f9dSMark Adams 9458f7e8f9dSMark Adams PetscFunctionBegin; 9468f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(A);CHKERRQ(ierr); 9478f7e8f9dSMark Adams ierr = MatLUFactorNumeric_SeqAIJ(B,A,info);CHKERRQ(ierr); 9488f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_CPU; 9498f7e8f9dSMark Adams PetscFunctionReturn(0); 9508f7e8f9dSMark Adams } 9518f7e8f9dSMark Adams 9528c3ff71bSJunchao Zhang static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A) 9538c3ff71bSJunchao Zhang { 954*076ba34aSJunchao Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 955*076ba34aSJunchao Zhang 9568c3ff71bSJunchao Zhang PetscFunctionBegin; 957*076ba34aSJunchao Zhang A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */ 9586f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 9596f3d89d0SStefano Zampini 9608c3ff71bSJunchao Zhang A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos; 9618c3ff71bSJunchao Zhang A->ops->destroy = MatDestroy_SeqAIJKokkos; 9628c3ff71bSJunchao Zhang A->ops->duplicate = MatDuplicate_SeqAIJKokkos; 963a587d139SMark A->ops->axpy = MatAXPY_SeqAIJKokkos; 964f0cf5187SStefano Zampini A->ops->scale = MatScale_SeqAIJKokkos; 965a587d139SMark A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos; 966*076ba34aSJunchao Zhang A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos; 9678c3ff71bSJunchao Zhang A->ops->mult = MatMult_SeqAIJKokkos; 9688c3ff71bSJunchao Zhang A->ops->multadd = MatMultAdd_SeqAIJKokkos; 9698c3ff71bSJunchao Zhang A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos; 9708c3ff71bSJunchao Zhang A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos; 9718c3ff71bSJunchao Zhang A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos; 9728c3ff71bSJunchao Zhang A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos; 973*076ba34aSJunchao Zhang A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos; 974152b3e56SJunchao Zhang A->ops->setoption = MatSetOption_SeqAIJKokkos; 975*076ba34aSJunchao Zhang a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos; 976*076ba34aSJunchao Zhang a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos; 977*076ba34aSJunchao Zhang a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos; 978*076ba34aSJunchao Zhang a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos; 979*076ba34aSJunchao Zhang a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos; 980*076ba34aSJunchao Zhang a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos; 981*076ba34aSJunchao Zhang PetscFunctionReturn(0); 982*076ba34aSJunchao Zhang } 983*076ba34aSJunchao Zhang 984*076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos *akok) 985*076ba34aSJunchao Zhang { 986*076ba34aSJunchao Zhang PetscErrorCode ierr; 987*076ba34aSJunchao Zhang Mat_SeqAIJ *aseq; 988*076ba34aSJunchao Zhang PetscInt i,m,n; 989*076ba34aSJunchao Zhang 990*076ba34aSJunchao Zhang PetscFunctionBegin; 991*076ba34aSJunchao Zhang if (A->spptr) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"A->spptr is supposed to be empty"); 992*076ba34aSJunchao Zhang 993*076ba34aSJunchao Zhang m = akok->nrows(); 994*076ba34aSJunchao Zhang n = akok->ncols(); 995*076ba34aSJunchao Zhang ierr = MatSetSizes(A,m,n,m,n);CHKERRQ(ierr); 996*076ba34aSJunchao Zhang ierr = MatSetType(A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 997*076ba34aSJunchao Zhang 998*076ba34aSJunchao Zhang /* Set up data structures of A as a MATSEQAIJ */ 999*076ba34aSJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(A,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 1000*076ba34aSJunchao Zhang aseq = (Mat_SeqAIJ*)(A)->data; 1001*076ba34aSJunchao Zhang 1002*076ba34aSJunchao Zhang akok->i_dual.sync_host(); /* We always need sync'ed i, j on host */ 1003*076ba34aSJunchao Zhang akok->j_dual.sync_host(); 1004*076ba34aSJunchao Zhang 1005*076ba34aSJunchao Zhang aseq->i = akok->i_host_data(); 1006*076ba34aSJunchao Zhang aseq->j = akok->j_host_data(); 1007*076ba34aSJunchao Zhang aseq->a = akok->a_host_data(); 1008*076ba34aSJunchao Zhang aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/ 1009*076ba34aSJunchao Zhang aseq->singlemalloc = PETSC_FALSE; 1010*076ba34aSJunchao Zhang aseq->free_a = PETSC_FALSE; 1011*076ba34aSJunchao Zhang aseq->free_ij = PETSC_FALSE; 1012*076ba34aSJunchao Zhang aseq->nz = akok->nnz(); 1013*076ba34aSJunchao Zhang aseq->maxnz = aseq->nz; 1014*076ba34aSJunchao Zhang 1015*076ba34aSJunchao Zhang ierr = PetscMalloc1(m,&aseq->imax);CHKERRQ(ierr); 1016*076ba34aSJunchao Zhang ierr = PetscMalloc1(m,&aseq->ilen);CHKERRQ(ierr); 1017*076ba34aSJunchao Zhang for (i=0; i<m; i++) { 1018*076ba34aSJunchao Zhang aseq->ilen[i] = aseq->imax[i] = aseq->i[i+1] - aseq->i[i]; 1019*076ba34aSJunchao Zhang } 1020*076ba34aSJunchao Zhang 1021*076ba34aSJunchao Zhang /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */ 1022*076ba34aSJunchao Zhang akok->nonzerostate = A->nonzerostate; 1023*076ba34aSJunchao Zhang ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY); 1024*076ba34aSJunchao Zhang ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY); 1025*076ba34aSJunchao Zhang A->spptr = akok; 1026*076ba34aSJunchao Zhang PetscFunctionReturn(0); 1027*076ba34aSJunchao Zhang } 1028*076ba34aSJunchao Zhang 1029*076ba34aSJunchao Zhang /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure 1030*076ba34aSJunchao Zhang 1031*076ba34aSJunchao Zhang Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR 1032*076ba34aSJunchao Zhang */ 1033*076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos *akok,Mat *A) 1034*076ba34aSJunchao Zhang { 1035*076ba34aSJunchao Zhang PetscErrorCode ierr; 1036*076ba34aSJunchao Zhang 1037*076ba34aSJunchao Zhang PetscFunctionBegin; 1038*076ba34aSJunchao Zhang ierr = MatCreate(comm,A);CHKERRQ(ierr); 1039*076ba34aSJunchao Zhang ierr = MatSetSeqAIJKokkosWithCSRMatrix(*A,akok);CHKERRQ(ierr); 10408c3ff71bSJunchao Zhang PetscFunctionReturn(0); 10418c3ff71bSJunchao Zhang } 10428c3ff71bSJunchao Zhang 10438c3ff71bSJunchao Zhang /* --------------------------------------------------------------------------------*/ 1044152b3e56SJunchao Zhang /*@C 10458c3ff71bSJunchao Zhang MatCreateSeqAIJKokkos - Creates a sparse matrix in AIJ (compressed row) format 10468c3ff71bSJunchao Zhang (the default parallel PETSc format). This matrix will ultimately be handled by 10478c3ff71bSJunchao Zhang Kokkos for calculations. For good matrix 10488c3ff71bSJunchao Zhang assembly performance the user should preallocate the matrix storage by setting 10498c3ff71bSJunchao Zhang the parameter nz (or the array nnz). By setting these parameters accurately, 10508c3ff71bSJunchao Zhang performance during matrix assembly can be increased by more than a factor of 50. 10518c3ff71bSJunchao Zhang 10528c3ff71bSJunchao Zhang Collective 10538c3ff71bSJunchao Zhang 10548c3ff71bSJunchao Zhang Input Parameters: 10558c3ff71bSJunchao Zhang + comm - MPI communicator, set to PETSC_COMM_SELF 10568c3ff71bSJunchao Zhang . m - number of rows 10578c3ff71bSJunchao Zhang . n - number of columns 10588c3ff71bSJunchao Zhang . nz - number of nonzeros per row (same for all rows) 10598c3ff71bSJunchao Zhang - nnz - array containing the number of nonzeros in the various rows 10608c3ff71bSJunchao Zhang (possibly different for each row) or NULL 10618c3ff71bSJunchao Zhang 10628c3ff71bSJunchao Zhang Output Parameter: 10638c3ff71bSJunchao Zhang . A - the matrix 10648c3ff71bSJunchao Zhang 10658c3ff71bSJunchao Zhang It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 10668c3ff71bSJunchao Zhang MatXXXXSetPreallocation() paradgm instead of this routine directly. 10678c3ff71bSJunchao Zhang [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 10688c3ff71bSJunchao Zhang 10698c3ff71bSJunchao Zhang Notes: 10708c3ff71bSJunchao Zhang If nnz is given then nz is ignored 10718c3ff71bSJunchao Zhang 10728c3ff71bSJunchao Zhang The AIJ format (also called the Yale sparse matrix format or 10738c3ff71bSJunchao Zhang compressed row storage), is fully compatible with standard Fortran 77 10748c3ff71bSJunchao Zhang storage. That is, the stored row and column indices can begin at 10758c3ff71bSJunchao Zhang either one (as in Fortran) or zero. See the users' manual for details. 10768c3ff71bSJunchao Zhang 10778c3ff71bSJunchao Zhang Specify the preallocated storage with either nz or nnz (not both). 10788c3ff71bSJunchao Zhang Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 10798c3ff71bSJunchao Zhang allocation. For large problems you MUST preallocate memory or you 10808c3ff71bSJunchao Zhang will get TERRIBLE performance, see the users' manual chapter on matrices. 10818c3ff71bSJunchao Zhang 10828c3ff71bSJunchao Zhang By default, this format uses inodes (identical nodes) when possible, to 10838c3ff71bSJunchao Zhang improve numerical efficiency of matrix-vector products and solves. We 10848c3ff71bSJunchao Zhang search for consecutive rows with the same nonzero structure, thereby 10858c3ff71bSJunchao Zhang reusing matrix information to achieve increased efficiency. 10868c3ff71bSJunchao Zhang 10878c3ff71bSJunchao Zhang Level: intermediate 10888c3ff71bSJunchao Zhang 1089*076ba34aSJunchao Zhang .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ() 10908c3ff71bSJunchao Zhang @*/ 10918c3ff71bSJunchao Zhang PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A) 10928c3ff71bSJunchao Zhang { 10938c3ff71bSJunchao Zhang PetscErrorCode ierr; 10948c3ff71bSJunchao Zhang 10958c3ff71bSJunchao Zhang PetscFunctionBegin; 10968c3ff71bSJunchao Zhang ierr = PetscKokkosInitializeCheck();CHKERRQ(ierr); 10978c3ff71bSJunchao Zhang ierr = MatCreate(comm,A);CHKERRQ(ierr); 10988c3ff71bSJunchao Zhang ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr); 10998c3ff71bSJunchao Zhang ierr = MatSetType(*A,MATSEQAIJKOKKOS);CHKERRQ(ierr); 11008c3ff71bSJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr); 11018c3ff71bSJunchao Zhang PetscFunctionReturn(0); 11028c3ff71bSJunchao Zhang } 1103930e68a5SMark Adams 11048f7e8f9dSMark Adams typedef Kokkos::TeamPolicy<>::member_type team_member; 11058f7e8f9dSMark Adams // 11068f7e8f9dSMark Adams // This factorization exploits block diagonal matrices with "Nf" attached to the matrix in a container. 11078f7e8f9dSMark Adams // Use -pc_factor_mat_ordering_type rcm to order decouple blocks of size N/Nf for this optimization 11088f7e8f9dSMark Adams // 11098f7e8f9dSMark Adams static PetscErrorCode MatLUFactorNumeric_SeqAIJKOKKOSDEVICE(Mat B,Mat A,const MatFactorInfo *info) 1110930e68a5SMark Adams { 11118f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 11128f7e8f9dSMark Adams Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos*>(A->spptr), *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 11138f7e8f9dSMark Adams IS isrow = b->row,isicol = b->icol; 1114930e68a5SMark Adams PetscErrorCode ierr; 11158f7e8f9dSMark Adams const PetscInt *r_h,*ic_h; 1116*076ba34aSJunchao Zhang const PetscInt n=A->rmap->n, *ai_d=aijkok->i_dual.view_device().data(), *aj_d=aijkok->j_dual.view_device().data(), *bi_d=baijkok->i_dual.view_device().data(), *bj_d=baijkok->j_dual.view_device().data(), *bdiag_d = baijkok->diag_d->data(); 1117*076ba34aSJunchao Zhang const PetscScalar *aa_d = aijkok->a_dual.view_device().data(); 1118*076ba34aSJunchao Zhang PetscScalar *ba_d = baijkok->a_dual.view_device().data(); 11198f7e8f9dSMark Adams PetscBool row_identity,col_identity; 11208f7e8f9dSMark Adams PetscInt nc, Nf, nVec=32; // should be a parameter 11218f7e8f9dSMark Adams PetscContainer container; 1122930e68a5SMark Adams 1123930e68a5SMark Adams PetscFunctionBegin; 11248f7e8f9dSMark Adams if (A->rmap->n != n) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"square matrices only supported %D %D",A->rmap->n,n); 11258f7e8f9dSMark Adams ierr = MatGetOption(A,MAT_STRUCTURALLY_SYMMETRIC,&row_identity);CHKERRQ(ierr); 11268f7e8f9dSMark Adams if (!row_identity) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"structurally symmetric matrices only supported"); 11278f7e8f9dSMark Adams ierr = PetscObjectQuery((PetscObject) A, "Nf", (PetscObject *) &container);CHKERRQ(ierr); 11288f7e8f9dSMark Adams if (container) { 11298f7e8f9dSMark Adams PetscInt *pNf=NULL, nv; 11308f7e8f9dSMark Adams ierr = PetscContainerGetPointer(container, (void **) &pNf);CHKERRQ(ierr); 11318f7e8f9dSMark Adams Nf = (*pNf)%1000; 11328f7e8f9dSMark Adams nv = (*pNf)/1000; 11338f7e8f9dSMark Adams if (nv>0) nVec = nv; 11348f7e8f9dSMark Adams } else Nf = 1; 11358f7e8f9dSMark Adams if (n%Nf) SETERRQ2(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"n % Nf != 0 %D %D",n,Nf); 11368f7e8f9dSMark Adams ierr = ISGetIndices(isrow,&r_h);CHKERRQ(ierr); 11378f7e8f9dSMark Adams ierr = ISGetIndices(isicol,&ic_h);CHKERRQ(ierr); 11388f7e8f9dSMark Adams ierr = ISGetSize(isicol,&nc);CHKERRQ(ierr); 11398f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 11408f7e8f9dSMark Adams ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr); 11418f7e8f9dSMark Adams #endif 11428f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 11438f7e8f9dSMark Adams { 11448f7e8f9dSMark Adams #define KOKKOS_SHARED_LEVEL 1 11458f7e8f9dSMark Adams using scr_mem_t = Kokkos::DefaultExecutionSpace::scratch_memory_space; 11468f7e8f9dSMark Adams using sizet_scr_t = Kokkos::View<size_t, scr_mem_t>; 11478f7e8f9dSMark Adams using scalar_scr_t = Kokkos::View<PetscScalar, scr_mem_t>; 11488f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_r_k (r_h, n); 11498f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_r_k ("r", n); 11508f7e8f9dSMark Adams const Kokkos::View<const PetscInt*, Kokkos::LayoutLeft, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_ic_k (ic_h, nc); 11518f7e8f9dSMark Adams Kokkos::View<PetscInt*, Kokkos::LayoutLeft> d_ic_k ("ic", nc); 11528f7e8f9dSMark Adams size_t flops_h = 0.0; 11538f7e8f9dSMark Adams Kokkos::View<size_t, Kokkos::HostSpace> h_flops_k (&flops_h); 11548f7e8f9dSMark Adams Kokkos::View<size_t> d_flops_k ("flops"); 11558f7e8f9dSMark Adams const int conc = Kokkos::DefaultExecutionSpace().concurrency(), team_size = conc > 1 ? 16 : 1; // 8*32 = 256 11568f7e8f9dSMark Adams const int nloc = n/Nf, Ni = (conc > 8) ? 1 /* some intelegent number of SMs -- but need league_barrier */ : 1; 11578f7e8f9dSMark Adams Kokkos::deep_copy (d_flops_k, h_flops_k); 11588f7e8f9dSMark Adams Kokkos::deep_copy (d_r_k, h_r_k); 11598f7e8f9dSMark Adams Kokkos::deep_copy (d_ic_k, h_ic_k); 11608f7e8f9dSMark Adams // Fill A --> fact 11618f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec), KOKKOS_LAMBDA (const team_member team) { 1162042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block = team.league_rank()%Ni; // use grid.x/y in CUDA 11638f7e8f9dSMark Adams const PetscInt nloc_i = (nloc/Ni + !!(nloc%Ni)), start_i = field*nloc + field_block*nloc_i, end_i = (start_i + nloc_i) > (field+1)*nloc ? (field+1)*nloc : (start_i + nloc_i); 11648f7e8f9dSMark Adams const PetscInt *ic = d_ic_k.data(), *r = d_r_k.data(); 11658f7e8f9dSMark Adams // zero rows of B 11668f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 11678f7e8f9dSMark Adams PetscInt nzbL = bi_d[rowb+1] - bi_d[rowb], nzbU = bdiag_d[rowb] - bdiag_d[rowb+1]; // with diag 11688f7e8f9dSMark Adams PetscScalar *baL = ba_d + bi_d[rowb]; 11698f7e8f9dSMark Adams PetscScalar *baU = ba_d + bdiag_d[rowb+1]+1; 11708f7e8f9dSMark Adams /* zero (unfactored row) */ 11718f7e8f9dSMark Adams for (int j=0;j<nzbL;j++) baL[j] = 0; 11728f7e8f9dSMark Adams for (int j=0;j<nzbU;j++) baU[j] = 0; 11738f7e8f9dSMark Adams }); 11748f7e8f9dSMark Adams // copy A into B 11758f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, start_i, end_i), [=] (const int &rowb) { 11768f7e8f9dSMark Adams PetscInt rowa = r[rowb], nza = ai_d[rowa+1] - ai_d[rowa]; 11778f7e8f9dSMark Adams const PetscScalar *av = aa_d + ai_d[rowa]; 11788f7e8f9dSMark Adams const PetscInt *ajtmp = aj_d + ai_d[rowa]; 11798f7e8f9dSMark Adams /* load in initial (unfactored row) */ 11808f7e8f9dSMark Adams for (int j=0;j<nza;j++) { 11818f7e8f9dSMark Adams PetscInt colb = ic[ajtmp[j]]; 11828f7e8f9dSMark Adams PetscScalar vala = av[j]; 11838f7e8f9dSMark Adams if (colb == rowb) { 11848f7e8f9dSMark Adams *(ba_d + bdiag_d[rowb]) = vala; 11858f7e8f9dSMark Adams } else { 11868f7e8f9dSMark Adams const PetscInt *pbj = bj_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 11878f7e8f9dSMark Adams PetscScalar *pba = ba_d + ((colb > rowb) ? bdiag_d[rowb+1]+1 : bi_d[rowb]); 11888f7e8f9dSMark Adams PetscInt nz = (colb > rowb) ? bdiag_d[rowb] - (bdiag_d[rowb+1]+1) : bi_d[rowb+1] - bi_d[rowb], set=0; 11898f7e8f9dSMark Adams for (int j=0; j<nz ; j++) { 11908f7e8f9dSMark Adams if (pbj[j] == colb) { 11918f7e8f9dSMark Adams pba[j] = vala; 11928f7e8f9dSMark Adams set++; 11938f7e8f9dSMark Adams break; 11948f7e8f9dSMark Adams } 11958f7e8f9dSMark Adams } 11968f7e8f9dSMark Adams if (set!=1) printf("\t\t\t ERROR DID NOT SET ?????\n"); 11978f7e8f9dSMark Adams } 11988f7e8f9dSMark Adams } 11998f7e8f9dSMark Adams }); 12008f7e8f9dSMark Adams }); 12018f7e8f9dSMark Adams Kokkos::fence(); 1202930e68a5SMark Adams 12038f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, team_size, nVec).set_scratch_size(KOKKOS_SHARED_LEVEL, Kokkos::PerThread(sizet_scr_t::shmem_size()+scalar_scr_t::shmem_size()), Kokkos::PerTeam(sizet_scr_t::shmem_size())), KOKKOS_LAMBDA (const team_member team) { 12048f7e8f9dSMark Adams sizet_scr_t colkIdx(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 12058f7e8f9dSMark Adams scalar_scr_t L_ki(team.thread_scratch(KOKKOS_SHARED_LEVEL)); 12068f7e8f9dSMark Adams sizet_scr_t flops(team.team_scratch(KOKKOS_SHARED_LEVEL)); 1207042217e8SBarry Smith const PetscInt field = team.league_rank()/Ni, field_block_idx = team.league_rank()%Ni; // use grid.x/y in CUDA 12088f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc; 12098f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { flops() = 0; }); 12108f7e8f9dSMark Adams // A22 panel update for each row A(1,:) and col A(:,1) 12118f7e8f9dSMark Adams for (int ii=start; ii<end-1; ii++) { 12128f7e8f9dSMark Adams const PetscInt *bjUi = bj_d + bdiag_d[ii+1]+1, nzUi = bdiag_d[ii] - (bdiag_d[ii+1]+1); // vector, and vector size, of column indices of U(i,(i+1):end) 12138f7e8f9dSMark Adams const PetscScalar *baUi = ba_d + bdiag_d[ii+1]+1; // vector of data U(i,i+1:end) 12148f7e8f9dSMark Adams const PetscInt nUi_its = nzUi/Ni + !!(nzUi%Ni); 12158f7e8f9dSMark Adams const PetscScalar Bii = *(ba_d + bdiag_d[ii]); // diagonal in its special place 12168f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamThreadRange(team, nUi_its), [=] (const int j) { 12178f7e8f9dSMark Adams PetscInt kIdx = j*Ni + field_block_idx; 12188f7e8f9dSMark Adams if (kIdx >= nzUi) /* void */ ; 12198f7e8f9dSMark Adams else { 12208f7e8f9dSMark Adams const PetscInt myk = bjUi[kIdx]; // assume symmetric structure, need a transposed meta-data here in general 12218f7e8f9dSMark Adams const PetscInt *pjL = bj_d + bi_d[myk]; // look for L(myk,ii) in start of row 12228f7e8f9dSMark Adams const PetscInt nzL = bi_d[myk+1] - bi_d[myk]; // size of L_k(:) 12238f7e8f9dSMark Adams size_t st_idx; 12248f7e8f9dSMark Adams // find and do L(k,i) = A(:k,i) / A(i,i) 12258f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [&]() { colkIdx() = PETSC_MAX_INT; }); 12268f7e8f9dSMark Adams // get column, there has got to be a better way 12278f7e8f9dSMark Adams Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,nzL), [&] (const int &j, size_t &idx) { 12288f7e8f9dSMark Adams //printf("\t\t%d) in vector loop, testing col %d =? %d (ii)\n",j,pjL[j],ii); 12298f7e8f9dSMark Adams if (pjL[j] == ii) { 12308f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + j; 12318f7e8f9dSMark Adams idx = j; // output 12328f7e8f9dSMark Adams *pLki = *pLki/Bii; // column scaling: L(k,i) = A(:k,i) / A(i,i) 12338f7e8f9dSMark Adams } 12348f7e8f9dSMark Adams }, st_idx); 12358f7e8f9dSMark Adams Kokkos::single(Kokkos::PerThread(team), [=]() { colkIdx() = st_idx; L_ki() = *(ba_d + bi_d[myk] + st_idx); }); 12368f7e8f9dSMark Adams if (colkIdx() == PETSC_MAX_INT) printf("\t\t\t\t\t\t\tERROR: failed to find L_ki(%d,%d)\n",myk,ii); 12378f7e8f9dSMark Adams else { // active row k, do A_kj -= Lki * U_ij; j \in U(i,:) j != i 12388f7e8f9dSMark Adams // U(i+1,:end) 12398f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,nzUi), [=] (const int &uiIdx) { // index into i (U) 12408f7e8f9dSMark Adams PetscScalar Uij = baUi[uiIdx]; 12418f7e8f9dSMark Adams PetscInt col = bjUi[uiIdx]; 12428f7e8f9dSMark Adams if (col==myk) { 12438f7e8f9dSMark Adams // A_kk = A_kk - L_ki * U_ij(k) 12448f7e8f9dSMark Adams PetscScalar *Akkv = (ba_d + bdiag_d[myk]); // diagonal in its special place 12458f7e8f9dSMark Adams *Akkv = *Akkv - L_ki() * Uij; // UiK 12468f7e8f9dSMark Adams } else { 12478f7e8f9dSMark Adams PetscScalar *start, *end, *pAkjv=NULL; 12488f7e8f9dSMark Adams PetscInt high, low; 12498f7e8f9dSMark Adams const PetscInt *startj; 12508f7e8f9dSMark Adams if (col<myk) { // L 12518f7e8f9dSMark Adams PetscScalar *pLki = ba_d + bi_d[myk] + colkIdx(); 12528f7e8f9dSMark Adams PetscInt idx = (pLki+1) - (ba_d + bi_d[myk]); // index into row 12538f7e8f9dSMark Adams start = pLki+1; // start at pLki+1, A22(myk,1) 12548f7e8f9dSMark Adams startj= bj_d + bi_d[myk] + idx; 12558f7e8f9dSMark Adams end = ba_d + bi_d[myk+1]; 12568f7e8f9dSMark Adams } else { 12578f7e8f9dSMark Adams PetscInt idx = bdiag_d[myk+1]+1; 12588f7e8f9dSMark Adams start = ba_d + idx; 12598f7e8f9dSMark Adams startj= bj_d + idx; 12608f7e8f9dSMark Adams end = ba_d + bdiag_d[myk]; 12618f7e8f9dSMark Adams } 12628f7e8f9dSMark Adams // search for 'col', use bisection search - TODO 12638f7e8f9dSMark Adams low = 0; 12648f7e8f9dSMark Adams high = (PetscInt)(end-start); 12658f7e8f9dSMark Adams while (high-low > 5) { 12668f7e8f9dSMark Adams int t = (low+high)/2; 12678f7e8f9dSMark Adams if (startj[t] > col) high = t; 12688f7e8f9dSMark Adams else low = t; 12698f7e8f9dSMark Adams } 12708f7e8f9dSMark Adams for (pAkjv=start+low; pAkjv<start+high; pAkjv++) { 12718f7e8f9dSMark Adams if (startj[pAkjv-start] == col) break; 12728f7e8f9dSMark Adams } 12738f7e8f9dSMark Adams if (pAkjv==start+high) printf("\t\t\t\t\t\t\t\t\t\t\tERROR: *** failed to find Akj(%d,%d)\n",myk,col); 12748f7e8f9dSMark Adams *pAkjv = *pAkjv - L_ki() * Uij; // A_kj = A_kj - L_ki * U_ij 12758f7e8f9dSMark Adams } 12768f7e8f9dSMark Adams }); 12778f7e8f9dSMark Adams } 12788f7e8f9dSMark Adams } 12798f7e8f9dSMark Adams }); 12808f7e8f9dSMark Adams team.team_barrier(); // this needs to be a league barrier to use more that one SM per block 12818f7e8f9dSMark Adams if (field_block_idx==0) Kokkos::single(Kokkos::PerTeam(team), [&]() { Kokkos::atomic_add( flops.data(), (size_t)(2*(nzUi*nzUi)+2)); }); 12828f7e8f9dSMark Adams } /* endof for (i=0; i<n; i++) { */ 12838f7e8f9dSMark Adams Kokkos::single(Kokkos::PerTeam(team), [=]() { Kokkos::atomic_add( &d_flops_k(), flops()); flops() = 0; }); 12848f7e8f9dSMark Adams }); 12858f7e8f9dSMark Adams Kokkos::fence(); 12868f7e8f9dSMark Adams Kokkos::deep_copy (h_flops_k, d_flops_k); 12878f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 12888f7e8f9dSMark Adams ierr = PetscLogGpuFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 12898f7e8f9dSMark Adams #elif defined(PETSC_USE_LOG) 12908f7e8f9dSMark Adams ierr = PetscLogFlops((PetscLogDouble)h_flops_k());CHKERRQ(ierr); 12918f7e8f9dSMark Adams #endif 12928f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamPolicy<>(Nf*Ni, 1, 256), KOKKOS_LAMBDA (const team_member team) { 12938f7e8f9dSMark Adams const PetscInt lg_rank = team.league_rank(), field = lg_rank/Ni; //, field_offset = lg_rank%Ni; 12948f7e8f9dSMark Adams const PetscInt start = field*nloc, end = start + nloc, n_its = (nloc/Ni + !!(nloc%Ni)); // 1/Ni iters 12958f7e8f9dSMark Adams /* Invert diagonal for simpler triangular solves */ 12968f7e8f9dSMark Adams Kokkos::parallel_for(Kokkos::TeamVectorRange(team, n_its), [=] (int outer_index) { 12978f7e8f9dSMark Adams int i = start + outer_index*Ni + lg_rank%Ni; 12988f7e8f9dSMark Adams if (i < end) { 12998f7e8f9dSMark Adams PetscScalar *pv = ba_d + bdiag_d[i]; 13008f7e8f9dSMark Adams *pv = 1.0/(*pv); 13018f7e8f9dSMark Adams } 13028f7e8f9dSMark Adams }); 13038f7e8f9dSMark Adams }); 13048f7e8f9dSMark Adams } 13058f7e8f9dSMark Adams #if defined(PETSC_HAVE_DEVICE) && defined(PETSC_USE_LOG) 13068f7e8f9dSMark Adams ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr); 13078f7e8f9dSMark Adams #endif 13088f7e8f9dSMark Adams ierr = ISRestoreIndices(isicol,&ic_h);CHKERRQ(ierr); 13098f7e8f9dSMark Adams ierr = ISRestoreIndices(isrow,&r_h);CHKERRQ(ierr); 13108f7e8f9dSMark Adams 13118f7e8f9dSMark Adams ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 13128f7e8f9dSMark Adams ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 13138f7e8f9dSMark Adams if (b->inode.size) { 13148f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_Inode; 13158f7e8f9dSMark Adams } else if (row_identity && col_identity) { 13168f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ_NaturalOrdering; 13178f7e8f9dSMark Adams } else { 13188f7e8f9dSMark Adams B->ops->solve = MatSolve_SeqAIJ; // at least this needs to be in Kokkos 13198f7e8f9dSMark Adams } 13208f7e8f9dSMark Adams B->offloadmask = PETSC_OFFLOAD_GPU; 13218f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncHost(B);CHKERRQ(ierr); // solve on CPU 13228f7e8f9dSMark Adams B->ops->solveadd = MatSolveAdd_SeqAIJ; // and this 13238f7e8f9dSMark Adams B->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 13248f7e8f9dSMark Adams B->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 13258f7e8f9dSMark Adams B->ops->matsolve = MatMatSolve_SeqAIJ; 13268f7e8f9dSMark Adams B->assembled = PETSC_TRUE; 13278f7e8f9dSMark Adams B->preallocated = PETSC_TRUE; 13288f7e8f9dSMark Adams 1329930e68a5SMark Adams PetscFunctionReturn(0); 1330930e68a5SMark Adams } 1331930e68a5SMark Adams 133286a27549SJunchao Zhang static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 1333930e68a5SMark Adams { 1334930e68a5SMark Adams PetscErrorCode ierr; 1335930e68a5SMark Adams 1336930e68a5SMark Adams PetscFunctionBegin; 1337930e68a5SMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 133886a27549SJunchao Zhang B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos; 133986a27549SJunchao Zhang PetscFunctionReturn(0); 134086a27549SJunchao Zhang } 134186a27549SJunchao Zhang 134286a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosSymbolicSolveCheck(Mat A) 134386a27549SJunchao Zhang { 134486a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 134586a27549SJunchao Zhang 134686a27549SJunchao Zhang PetscFunctionBegin; 134786a27549SJunchao Zhang if (!factors->sptrsv_symbolic_completed) { 134886a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d); 134986a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d); 135086a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_TRUE; 135186a27549SJunchao Zhang } 135286a27549SJunchao Zhang PetscFunctionReturn(0); 135386a27549SJunchao Zhang } 135486a27549SJunchao Zhang 135586a27549SJunchao Zhang /* Check if we need to update factors etc for transpose solve */ 135686a27549SJunchao Zhang static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A) 135786a27549SJunchao Zhang { 135886a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 1359*076ba34aSJunchao Zhang MatColIdxType n = A->rmap->n; 136086a27549SJunchao Zhang 136186a27549SJunchao Zhang PetscFunctionBegin; 136286a27549SJunchao Zhang if (!factors->transpose_updated) { /* TODO: KK needs to provide functions to do numeric transpose only */ 136386a27549SJunchao Zhang /* Update L^T and do sptrsv symbolic */ 1364*076ba34aSJunchao Zhang factors->iLt_d = MatRowMapKokkosView("factors->iLt_d",n+1); 136586a27549SJunchao Zhang Kokkos::deep_copy(factors->iLt_d,0); /* KK requires 0 */ 1366*076ba34aSJunchao Zhang factors->jLt_d = MatColIdxKokkosView("factors->jLt_d",factors->jL_d.extent(0)); 1367*076ba34aSJunchao Zhang factors->aLt_d = MatScalarKokkosView("factors->aLt_d",factors->aL_d.extent(0)); 136886a27549SJunchao Zhang 136986a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1370*076ba34aSJunchao Zhang ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1371*076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1372*076ba34aSJunchao Zhang MatRowMapKokkosView,DefaultExecutionSpace>( 137386a27549SJunchao Zhang n,n,factors->iL_d,factors->jL_d,factors->aL_d, 137486a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 137586a27549SJunchao Zhang 137686a27549SJunchao Zhang /* TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. 137786a27549SJunchao Zhang We have to sort the indices, until KK provides finer control options. 137886a27549SJunchao Zhang */ 1379*076ba34aSJunchao Zhang KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1380*076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 138186a27549SJunchao Zhang factors->iLt_d,factors->jLt_d,factors->aLt_d); 138286a27549SJunchao Zhang 138386a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d); 138486a27549SJunchao Zhang 138586a27549SJunchao Zhang /* Update U^T and do sptrsv symbolic */ 1386*076ba34aSJunchao Zhang factors->iUt_d = MatRowMapKokkosView("factors->iUt_d",n+1); 138786a27549SJunchao Zhang Kokkos::deep_copy(factors->iUt_d,0); /* KK requires 0 */ 1388*076ba34aSJunchao Zhang factors->jUt_d = MatColIdxKokkosView("factors->jUt_d",factors->jU_d.extent(0)); 1389*076ba34aSJunchao Zhang factors->aUt_d = MatScalarKokkosView("factors->aUt_d",factors->aU_d.extent(0)); 139086a27549SJunchao Zhang 139186a27549SJunchao Zhang KokkosKernels::Impl::transpose_matrix< 1392*076ba34aSJunchao Zhang ConstMatRowMapKokkosView,ConstMatColIdxKokkosView,ConstMatScalarKokkosView, 1393*076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView, 1394*076ba34aSJunchao Zhang MatRowMapKokkosView,DefaultExecutionSpace>( 139586a27549SJunchao Zhang n,n,factors->iU_d, factors->jU_d, factors->aU_d, 139686a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 139786a27549SJunchao Zhang 139886a27549SJunchao Zhang /* Sort indices. See comments above */ 1399*076ba34aSJunchao Zhang KokkosKernels::sort_crs_matrix<DefaultExecutionSpace, 1400*076ba34aSJunchao Zhang MatRowMapKokkosView,MatColIdxKokkosView,MatScalarKokkosView>( 140186a27549SJunchao Zhang factors->iUt_d,factors->jUt_d,factors->aUt_d); 140286a27549SJunchao Zhang 140386a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_symbolic(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d); 140486a27549SJunchao Zhang factors->transpose_updated = PETSC_TRUE; 140586a27549SJunchao Zhang } 140686a27549SJunchao Zhang PetscFunctionReturn(0); 140786a27549SJunchao Zhang } 140886a27549SJunchao Zhang 140986a27549SJunchao Zhang /* Solve Ax = b, with A = LU */ 141086a27549SJunchao Zhang static PetscErrorCode MatSolve_SeqAIJKokkos(Mat A,Vec b,Vec x) 141186a27549SJunchao Zhang { 141286a27549SJunchao Zhang PetscErrorCode ierr; 141386a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 141486a27549SJunchao Zhang PetscScalarKokkosView xv; 141586a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 141686a27549SJunchao Zhang 141786a27549SJunchao Zhang PetscFunctionBegin; 141886a27549SJunchao Zhang ierr = MatSeqAIJKokkosSymbolicSolveCheck(A);CHKERRQ(ierr); 141986a27549SJunchao Zhang ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 142086a27549SJunchao Zhang ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 142186a27549SJunchao Zhang /* Solve L tmpv = b */ 14223f520e80SJunchao Zhang CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khL,factors->iL_d,factors->jL_d,factors->aL_d,bv,factors->workVector)); 142386a27549SJunchao Zhang /* Solve Ux = tmpv */ 14243f520e80SJunchao Zhang CHKERRCXX(KokkosSparse::Experimental::sptrsv_solve(&factors->khU,factors->iU_d,factors->jU_d,factors->aU_d,factors->workVector,xv)); 142586a27549SJunchao Zhang ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 142686a27549SJunchao Zhang ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 142786a27549SJunchao Zhang PetscFunctionReturn(0); 142886a27549SJunchao Zhang } 142986a27549SJunchao Zhang 1430*076ba34aSJunchao Zhang /* Solve A^T x = b, where A^T = U^T L^T */ 143186a27549SJunchao Zhang static PetscErrorCode MatSolveTranspose_SeqAIJKokkos(Mat A,Vec b,Vec x) 143286a27549SJunchao Zhang { 143386a27549SJunchao Zhang PetscErrorCode ierr; 143486a27549SJunchao Zhang ConstPetscScalarKokkosView bv; 143586a27549SJunchao Zhang PetscScalarKokkosView xv; 143686a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)A->spptr; 143786a27549SJunchao Zhang 143886a27549SJunchao Zhang PetscFunctionBegin; 143986a27549SJunchao Zhang ierr = MatSeqAIJKokkosTransposeSolveCheck(A);CHKERRQ(ierr); 144086a27549SJunchao Zhang ierr = VecGetKokkosView(x,&xv);CHKERRQ(ierr); 144186a27549SJunchao Zhang ierr = VecGetKokkosView(b,&bv);CHKERRQ(ierr); 144286a27549SJunchao Zhang /* Solve U^T tmpv = b */ 144386a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khUt,factors->iUt_d,factors->jUt_d,factors->aUt_d,bv,factors->workVector); 144486a27549SJunchao Zhang 144586a27549SJunchao Zhang /* Solve L^T x = tmpv */ 144686a27549SJunchao Zhang KokkosSparse::Experimental::sptrsv_solve(&factors->khLt,factors->iLt_d,factors->jLt_d,factors->aLt_d,factors->workVector,xv); 144786a27549SJunchao Zhang ierr = VecRestoreKokkosView(x,&xv);CHKERRQ(ierr); 144886a27549SJunchao Zhang ierr = VecRestoreKokkosView(b,&bv);CHKERRQ(ierr); 144986a27549SJunchao Zhang PetscFunctionReturn(0); 145086a27549SJunchao Zhang } 145186a27549SJunchao Zhang 145286a27549SJunchao Zhang static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo *info) 145386a27549SJunchao Zhang { 145486a27549SJunchao Zhang PetscErrorCode ierr; 145586a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos*)A->spptr; 145686a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 145786a27549SJunchao Zhang PetscInt fill_lev = info->levels; 145886a27549SJunchao Zhang 145986a27549SJunchao Zhang PetscFunctionBegin; 146086a27549SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 1461*076ba34aSJunchao Zhang 1462*076ba34aSJunchao Zhang auto a_d = aijkok->a_dual.view_device(); 1463*076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1464*076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1465*076ba34aSJunchao Zhang 1466*076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_numeric(&factors->kh,fill_lev,i_d,j_d,a_d,factors->iL_d,factors->jL_d,factors->aL_d,factors->iU_d,factors->jU_d,factors->aU_d); 146786a27549SJunchao Zhang 146886a27549SJunchao Zhang B->assembled = PETSC_TRUE; 146986a27549SJunchao Zhang B->preallocated = PETSC_TRUE; 147086a27549SJunchao Zhang B->ops->solve = MatSolve_SeqAIJKokkos; 147186a27549SJunchao Zhang B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos; 147286a27549SJunchao Zhang B->ops->matsolve = NULL; 147386a27549SJunchao Zhang B->ops->matsolvetranspose = NULL; 147486a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 147586a27549SJunchao Zhang 147686a27549SJunchao Zhang /* Once the factors' value changed, we need to update their transpose and sptrsv handle */ 147786a27549SJunchao Zhang factors->transpose_updated = PETSC_FALSE; 147886a27549SJunchao Zhang factors->sptrsv_symbolic_completed = PETSC_FALSE; 147986a27549SJunchao Zhang PetscFunctionReturn(0); 148086a27549SJunchao Zhang } 148186a27549SJunchao Zhang 148286a27549SJunchao Zhang static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 148386a27549SJunchao Zhang { 148486a27549SJunchao Zhang PetscErrorCode ierr; 148586a27549SJunchao Zhang Mat_SeqAIJKokkos *aijkok; 148686a27549SJunchao Zhang Mat_SeqAIJ *b; 148786a27549SJunchao Zhang Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors*)B->spptr; 148886a27549SJunchao Zhang PetscInt fill_lev = info->levels; 148986a27549SJunchao Zhang PetscInt nnzA = ((Mat_SeqAIJ*)A->data)->nz,nnzL,nnzU; 149086a27549SJunchao Zhang PetscInt n = A->rmap->n; 149186a27549SJunchao Zhang 149286a27549SJunchao Zhang PetscFunctionBegin; 149386a27549SJunchao Zhang ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); 149486a27549SJunchao Zhang /* Rebuild factors */ 149586a27549SJunchao Zhang if (factors) {factors->Destroy();} /* Destroy the old if it exists */ 149686a27549SJunchao Zhang else {B->spptr = factors = new Mat_SeqAIJKokkosTriFactors(n);} 149786a27549SJunchao Zhang 149886a27549SJunchao Zhang /* Create a spiluk handle and then do symbolic factorization */ 149986a27549SJunchao Zhang nnzL = nnzU = PetscRealIntMultTruncate(info->fill,nnzA); 150086a27549SJunchao Zhang factors->kh.create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,n,nnzL,nnzU); 150186a27549SJunchao Zhang 150286a27549SJunchao Zhang auto spiluk_handle = factors->kh.get_spiluk_handle(); 150386a27549SJunchao Zhang 150486a27549SJunchao Zhang Kokkos::realloc(factors->iL_d,n+1); /* Free old arrays and realloc */ 150586a27549SJunchao Zhang Kokkos::realloc(factors->jL_d,spiluk_handle->get_nnzL()); 150686a27549SJunchao Zhang Kokkos::realloc(factors->iU_d,n+1); 150786a27549SJunchao Zhang Kokkos::realloc(factors->jU_d,spiluk_handle->get_nnzU()); 150886a27549SJunchao Zhang 150986a27549SJunchao Zhang aijkok = (Mat_SeqAIJKokkos*)A->spptr; 1510*076ba34aSJunchao Zhang auto i_d = aijkok->i_dual.view_device(); 1511*076ba34aSJunchao Zhang auto j_d = aijkok->j_dual.view_device(); 1512*076ba34aSJunchao Zhang KokkosSparse::Experimental::spiluk_symbolic(&factors->kh,fill_lev,i_d,j_d,factors->iL_d,factors->jL_d,factors->iU_d,factors->jU_d); 151386a27549SJunchao Zhang /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */ 151486a27549SJunchao Zhang 151586a27549SJunchao Zhang Kokkos::resize (factors->jL_d,spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */ 151686a27549SJunchao Zhang Kokkos::resize (factors->jU_d,spiluk_handle->get_nnzU()); 151786a27549SJunchao Zhang Kokkos::realloc(factors->aL_d,spiluk_handle->get_nnzL()); /* No need to retain old value */ 151886a27549SJunchao Zhang Kokkos::realloc(factors->aU_d,spiluk_handle->get_nnzU()); 151986a27549SJunchao Zhang 152086a27549SJunchao Zhang /* TODO: add options to select sptrsv algorithms */ 152186a27549SJunchao Zhang /* Create sptrsv handles for L, U and their transpose */ 152286a27549SJunchao Zhang #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) 152386a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE; 152486a27549SJunchao Zhang #else 152586a27549SJunchao Zhang auto sptrsv_alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1; 152686a27549SJunchao Zhang #endif 152786a27549SJunchao Zhang 152886a27549SJunchao Zhang factors->khL.create_sptrsv_handle(sptrsv_alg,n,true/* L is lower tri */); 152986a27549SJunchao Zhang factors->khU.create_sptrsv_handle(sptrsv_alg,n,false/* U is not lower tri */); 153086a27549SJunchao Zhang factors->khLt.create_sptrsv_handle(sptrsv_alg,n,false/* L^T is not lower tri */); 153186a27549SJunchao Zhang factors->khUt.create_sptrsv_handle(sptrsv_alg,n,true/* U^T is lower tri */); 153286a27549SJunchao Zhang 153386a27549SJunchao Zhang /* Fill fields of the factor matrix B */ 153486a27549SJunchao Zhang ierr = MatSeqAIJSetPreallocation_SeqAIJ(B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 153586a27549SJunchao Zhang b = (Mat_SeqAIJ*)B->data; 153686a27549SJunchao Zhang b->nz = b->maxnz = spiluk_handle->get_nnzL()+spiluk_handle->get_nnzU(); 153786a27549SJunchao Zhang B->info.fill_ratio_given = info->fill; 153886a27549SJunchao Zhang B->info.fill_ratio_needed = ((PetscReal)b->nz)/((PetscReal)nnzA); 153986a27549SJunchao Zhang 154086a27549SJunchao Zhang B->offloadmask = PETSC_OFFLOAD_GPU; 154186a27549SJunchao Zhang B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos; 1542930e68a5SMark Adams PetscFunctionReturn(0); 1543930e68a5SMark Adams } 1544930e68a5SMark Adams 15458f7e8f9dSMark Adams static PetscErrorCode MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 15468f7e8f9dSMark Adams { 15478f7e8f9dSMark Adams PetscErrorCode ierr; 15488f7e8f9dSMark Adams Mat_SeqAIJ *b=(Mat_SeqAIJ*)B->data; 15498f7e8f9dSMark Adams const PetscInt nrows = A->rmap->n; 1550930e68a5SMark Adams 15518f7e8f9dSMark Adams PetscFunctionBegin; 15528f7e8f9dSMark Adams ierr = MatLUFactorSymbolic_SeqAIJ(B,A,isrow,iscol,info);CHKERRQ(ierr); 15538f7e8f9dSMark Adams B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKOKKOSDEVICE; 15548f7e8f9dSMark Adams // move B data into Kokkos 15558f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(B);CHKERRQ(ierr); // create aijkok 15568f7e8f9dSMark Adams ierr = MatSeqAIJKokkosSyncDevice(A);CHKERRQ(ierr); // create aijkok 15578f7e8f9dSMark Adams { 15588f7e8f9dSMark Adams Mat_SeqAIJKokkos *baijkok = static_cast<Mat_SeqAIJKokkos*>(B->spptr); 15598f7e8f9dSMark Adams if (!baijkok->diag_d) { 15608f7e8f9dSMark Adams const Kokkos::View<PetscInt*, Kokkos::HostSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > h_diag (b->diag,nrows+1); 1561152b3e56SJunchao Zhang baijkok->diag_d = new Kokkos::View<PetscInt*>(Kokkos::create_mirror(DefaultMemorySpace(),h_diag)); 15628f7e8f9dSMark Adams Kokkos::deep_copy (*baijkok->diag_d, h_diag); 15638f7e8f9dSMark Adams } 15648f7e8f9dSMark Adams } 15658f7e8f9dSMark Adams PetscFunctionReturn(0); 15668f7e8f9dSMark Adams } 15678f7e8f9dSMark Adams 156886a27549SJunchao Zhang static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos(Mat A,MatSolverType *type) 1569930e68a5SMark Adams { 1570930e68a5SMark Adams PetscFunctionBegin; 1571930e68a5SMark Adams *type = MATSOLVERKOKKOS; 1572930e68a5SMark Adams PetscFunctionReturn(0); 1573930e68a5SMark Adams } 1574930e68a5SMark Adams 15758f7e8f9dSMark Adams static PetscErrorCode MatFactorGetSolverType_seqaij_kokkos_device(Mat A,MatSolverType *type) 15768f7e8f9dSMark Adams { 15778f7e8f9dSMark Adams PetscFunctionBegin; 15788f7e8f9dSMark Adams *type = MATSOLVERKOKKOSDEVICE; 15798f7e8f9dSMark Adams PetscFunctionReturn(0); 15808f7e8f9dSMark Adams } 15818f7e8f9dSMark Adams 1582930e68a5SMark Adams /*MC 158386a27549SJunchao Zhang MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices 158486a27549SJunchao Zhang on a single GPU of type, SeqAIJKokkos, AIJKokkos. 1585930e68a5SMark Adams 1586930e68a5SMark Adams Level: beginner 1587930e68a5SMark Adams 158886a27549SJunchao Zhang .seealso: PCFactorSetMatSolverType(), MatSolverType, MatCreateSeqAIJKokkos(), MATAIJKOKKOS, MatCreateAIJKokkos(), MatKokkosSetFormat(), MatKokkosStorageFormat, MatKokkosFormatOperation 1589930e68a5SMark Adams M*/ 159086a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */ 1591930e68a5SMark Adams { 1592930e68a5SMark Adams PetscErrorCode ierr; 1593930e68a5SMark Adams PetscInt n = A->rmap->n; 1594930e68a5SMark Adams 1595930e68a5SMark Adams PetscFunctionBegin; 1596930e68a5SMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 1597930e68a5SMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 1598930e68a5SMark Adams (*B)->factortype = ftype; 15994ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 1600930e68a5SMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 1601930e68a5SMark Adams 16028f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 1603930e68a5SMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 160486a27549SJunchao Zhang (*B)->canuseordering = PETSC_TRUE; 160586a27549SJunchao Zhang (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos; 160686a27549SJunchao Zhang } else if (ftype == MAT_FACTOR_ILU) { 160786a27549SJunchao Zhang ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 160886a27549SJunchao Zhang (*B)->canuseordering = PETSC_FALSE; 160986a27549SJunchao Zhang (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos; 161086a27549SJunchao Zhang } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]); 1611930e68a5SMark Adams 1612930e68a5SMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 161386a27549SJunchao Zhang ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_SeqAIJKokkos);CHKERRQ(ierr); 1614930e68a5SMark Adams PetscFunctionReturn(0); 1615930e68a5SMark Adams } 16168f7e8f9dSMark Adams 16178f7e8f9dSMark Adams PETSC_EXTERN PetscErrorCode MatGetFactor_seqaijkokkos_kokkos_device(Mat A,MatFactorType ftype,Mat *B) 16188f7e8f9dSMark Adams { 16198f7e8f9dSMark Adams PetscErrorCode ierr; 16208f7e8f9dSMark Adams PetscInt n = A->rmap->n; 16218f7e8f9dSMark Adams 16228f7e8f9dSMark Adams PetscFunctionBegin; 16238f7e8f9dSMark Adams ierr = MatCreate(PetscObjectComm((PetscObject)A),B);CHKERRQ(ierr); 16248f7e8f9dSMark Adams ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 16258f7e8f9dSMark Adams (*B)->factortype = ftype; 1626f73b0415SBarry Smith (*B)->canuseordering = PETSC_TRUE; 16274ac6704cSBarry Smith ierr = PetscStrallocpy(MATORDERINGND,(char**)&(*B)->preferredordering[MAT_FACTOR_LU]);CHKERRQ(ierr); 16288f7e8f9dSMark Adams ierr = MatSetType(*B,MATSEQAIJKOKKOS);CHKERRQ(ierr); 16298f7e8f9dSMark Adams 16308f7e8f9dSMark Adams if (ftype == MAT_FACTOR_LU) { 16318f7e8f9dSMark Adams ierr = MatSetBlockSizesFromMats(*B,A,A);CHKERRQ(ierr); 16328f7e8f9dSMark Adams (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKOKKOSDEVICE; 16338f7e8f9dSMark Adams } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Factor type not supported for KOKKOS Matrix Types"); 16348f7e8f9dSMark Adams 16358f7e8f9dSMark Adams ierr = MatSeqAIJSetPreallocation(*B,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 16368f7e8f9dSMark Adams ierr = PetscObjectComposeFunction((PetscObject)(*B),"MatFactorGetSolverType_C",MatFactorGetSolverType_seqaij_kokkos_device);CHKERRQ(ierr); 16378f7e8f9dSMark Adams PetscFunctionReturn(0); 16388f7e8f9dSMark Adams } 163986a27549SJunchao Zhang 164086a27549SJunchao Zhang PETSC_EXTERN PetscErrorCode MatSolverTypeRegister_KOKKOS(void) 164186a27549SJunchao Zhang { 164286a27549SJunchao Zhang PetscErrorCode ierr; 164386a27549SJunchao Zhang 164486a27549SJunchao Zhang PetscFunctionBegin; 164586a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 164686a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOS,MATSEQAIJKOKKOS,MAT_FACTOR_ILU,MatGetFactor_SeqAIJKokkos_Kokkos);CHKERRQ(ierr); 164786a27549SJunchao Zhang ierr = MatSolverTypeRegister(MATSOLVERKOKKOSDEVICE,MATSEQAIJKOKKOS,MAT_FACTOR_LU,MatGetFactor_seqaijkokkos_kokkos_device);CHKERRQ(ierr); 164886a27549SJunchao Zhang PetscFunctionReturn(0); 164986a27549SJunchao Zhang } 165086a27549SJunchao Zhang 1651*076ba34aSJunchao Zhang /* Utility to print out a KokkosCsrMatrix for debugging */ 1652*076ba34aSJunchao Zhang PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix& csrmat) 1653*076ba34aSJunchao Zhang { 1654*076ba34aSJunchao Zhang PetscErrorCode ierr; 1655*076ba34aSJunchao Zhang const auto& iv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.row_map); 1656*076ba34aSJunchao Zhang const auto& jv = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.graph.entries); 1657*076ba34aSJunchao Zhang const auto& av = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),csrmat.values); 1658*076ba34aSJunchao Zhang const PetscInt *i = iv.data(); 1659*076ba34aSJunchao Zhang const PetscInt *j = jv.data(); 1660*076ba34aSJunchao Zhang const PetscScalar *a = av.data(); 1661*076ba34aSJunchao Zhang PetscInt m = csrmat.numRows(),n = csrmat.numCols(),nnz = csrmat.nnz(); 1662*076ba34aSJunchao Zhang 1663*076ba34aSJunchao Zhang PetscFunctionBegin; 1664*076ba34aSJunchao Zhang ierr = PetscPrintf(PETSC_COMM_SELF,"%D x %D SeqAIJKokkos, with %D nonzeros\n",m,n,nnz);CHKERRQ(ierr); 1665*076ba34aSJunchao Zhang for (PetscInt k=0; k<m; k++) { 1666*076ba34aSJunchao Zhang ierr = PetscPrintf(PETSC_COMM_SELF,"%D: ",k);CHKERRQ(ierr); 1667*076ba34aSJunchao Zhang for (PetscInt p=i[k]; p<i[k+1]; p++) { 1668*076ba34aSJunchao Zhang ierr = PetscPrintf(PETSC_COMM_SELF,"%D(%.1f), ",j[p],a[p]);CHKERRQ(ierr); 1669*076ba34aSJunchao Zhang } 1670*076ba34aSJunchao Zhang ierr = PetscPrintf(PETSC_COMM_SELF,"\n");CHKERRQ(ierr); 1671*076ba34aSJunchao Zhang } 1672*076ba34aSJunchao Zhang PetscFunctionReturn(0); 1673*076ba34aSJunchao Zhang } 1674